@rankcli/agent-runtime 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/analyzer-2CSWIQGD.mjs +6 -0
- package/dist/chunk-YNZYHEYM.mjs +774 -0
- package/dist/index.d.mts +4012 -0
- package/dist/index.d.ts +4012 -0
- package/dist/index.js +29672 -0
- package/dist/index.mjs +28602 -0
- package/package.json +53 -0
- package/scripts/build-deno.ts +134 -0
- package/src/audit/ai/analyzer.ts +347 -0
- package/src/audit/ai/index.ts +29 -0
- package/src/audit/ai/prompts/content-analysis.ts +271 -0
- package/src/audit/ai/types.ts +179 -0
- package/src/audit/checks/additional-checks.ts +439 -0
- package/src/audit/checks/ai-citation-worthiness.ts +399 -0
- package/src/audit/checks/ai-content-structure.ts +325 -0
- package/src/audit/checks/ai-readiness.ts +339 -0
- package/src/audit/checks/anchor-text.ts +179 -0
- package/src/audit/checks/answer-conciseness.ts +322 -0
- package/src/audit/checks/asset-minification.ts +270 -0
- package/src/audit/checks/bing-optimization.ts +206 -0
- package/src/audit/checks/brand-mention-optimization.ts +349 -0
- package/src/audit/checks/caching-headers.ts +305 -0
- package/src/audit/checks/canonical-advanced.ts +150 -0
- package/src/audit/checks/canonical-domain.ts +196 -0
- package/src/audit/checks/citation-quality.ts +358 -0
- package/src/audit/checks/client-rendering.ts +542 -0
- package/src/audit/checks/color-contrast.ts +342 -0
- package/src/audit/checks/content-freshness.ts +170 -0
- package/src/audit/checks/content-science.ts +589 -0
- package/src/audit/checks/conversion-elements.ts +526 -0
- package/src/audit/checks/crawlability.ts +220 -0
- package/src/audit/checks/directory-listing.ts +172 -0
- package/src/audit/checks/dom-analysis.ts +191 -0
- package/src/audit/checks/dom-size.ts +246 -0
- package/src/audit/checks/duplicate-content.ts +194 -0
- package/src/audit/checks/eeat-signals.ts +990 -0
- package/src/audit/checks/entity-seo.ts +396 -0
- package/src/audit/checks/featured-snippet.ts +473 -0
- package/src/audit/checks/freshness-signals.ts +443 -0
- package/src/audit/checks/funnel-intent.ts +463 -0
- package/src/audit/checks/hreflang.ts +174 -0
- package/src/audit/checks/html-compliance.ts +302 -0
- package/src/audit/checks/image-dimensions.ts +167 -0
- package/src/audit/checks/images.ts +160 -0
- package/src/audit/checks/indexnow.ts +275 -0
- package/src/audit/checks/interactive-tools.ts +475 -0
- package/src/audit/checks/internal-link-graph.ts +436 -0
- package/src/audit/checks/keyword-analysis.ts +239 -0
- package/src/audit/checks/keyword-cannibalization.ts +385 -0
- package/src/audit/checks/keyword-placement.ts +471 -0
- package/src/audit/checks/links.ts +203 -0
- package/src/audit/checks/llms-txt.ts +224 -0
- package/src/audit/checks/local-seo.ts +296 -0
- package/src/audit/checks/mobile.ts +167 -0
- package/src/audit/checks/modern-images.ts +226 -0
- package/src/audit/checks/navboost-signals.ts +395 -0
- package/src/audit/checks/on-page.ts +209 -0
- package/src/audit/checks/page-resources.ts +285 -0
- package/src/audit/checks/pagination.ts +180 -0
- package/src/audit/checks/performance.ts +153 -0
- package/src/audit/checks/platform-presence.ts +580 -0
- package/src/audit/checks/redirect-analysis.ts +153 -0
- package/src/audit/checks/redirect-chain.ts +389 -0
- package/src/audit/checks/resource-hints.ts +420 -0
- package/src/audit/checks/responsive-css.ts +247 -0
- package/src/audit/checks/responsive-images.ts +396 -0
- package/src/audit/checks/review-ecosystem.ts +415 -0
- package/src/audit/checks/robots-validation.ts +373 -0
- package/src/audit/checks/security-headers.ts +172 -0
- package/src/audit/checks/security.ts +144 -0
- package/src/audit/checks/serp-preview.ts +251 -0
- package/src/audit/checks/site-maturity.ts +444 -0
- package/src/audit/checks/social-meta.test.ts +275 -0
- package/src/audit/checks/social-meta.ts +134 -0
- package/src/audit/checks/soft-404.ts +151 -0
- package/src/audit/checks/structured-data.ts +238 -0
- package/src/audit/checks/tech-detection.ts +496 -0
- package/src/audit/checks/topical-clusters.ts +435 -0
- package/src/audit/checks/tracker-bloat.ts +462 -0
- package/src/audit/checks/tracking-verification.test.ts +371 -0
- package/src/audit/checks/tracking-verification.ts +636 -0
- package/src/audit/checks/url-safety.ts +682 -0
- package/src/audit/deno-entry.ts +66 -0
- package/src/audit/discovery/index.ts +15 -0
- package/src/audit/discovery/link-crawler.ts +232 -0
- package/src/audit/discovery/repo-routes.ts +347 -0
- package/src/audit/engine.ts +620 -0
- package/src/audit/fixes/index.ts +209 -0
- package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
- package/src/audit/fixes/social-meta-fixes.ts +463 -0
- package/src/audit/index.ts +74 -0
- package/src/audit/runner.test.ts +299 -0
- package/src/audit/runner.ts +130 -0
- package/src/audit/types.ts +1953 -0
- package/src/content/featured-snippet.ts +367 -0
- package/src/content/generator.test.ts +534 -0
- package/src/content/generator.ts +501 -0
- package/src/content/headline.ts +317 -0
- package/src/content/index.ts +62 -0
- package/src/content/intent.ts +258 -0
- package/src/content/keyword-density.ts +349 -0
- package/src/content/readability.ts +262 -0
- package/src/executor.ts +336 -0
- package/src/fixer.ts +416 -0
- package/src/frameworks/detector.test.ts +248 -0
- package/src/frameworks/detector.ts +371 -0
- package/src/frameworks/index.ts +68 -0
- package/src/frameworks/recipes/angular.yaml +171 -0
- package/src/frameworks/recipes/astro.yaml +206 -0
- package/src/frameworks/recipes/django.yaml +180 -0
- package/src/frameworks/recipes/laravel.yaml +137 -0
- package/src/frameworks/recipes/nextjs.yaml +268 -0
- package/src/frameworks/recipes/nuxt.yaml +175 -0
- package/src/frameworks/recipes/rails.yaml +188 -0
- package/src/frameworks/recipes/react.yaml +202 -0
- package/src/frameworks/recipes/sveltekit.yaml +154 -0
- package/src/frameworks/recipes/vue.yaml +137 -0
- package/src/frameworks/recipes/wordpress.yaml +209 -0
- package/src/frameworks/suggestion-engine.ts +320 -0
- package/src/geo/geo-content.test.ts +305 -0
- package/src/geo/geo-content.ts +266 -0
- package/src/geo/geo-history.test.ts +473 -0
- package/src/geo/geo-history.ts +433 -0
- package/src/geo/geo-tracker.test.ts +359 -0
- package/src/geo/geo-tracker.ts +411 -0
- package/src/geo/index.ts +10 -0
- package/src/git/commit-helper.test.ts +261 -0
- package/src/git/commit-helper.ts +329 -0
- package/src/git/index.ts +12 -0
- package/src/git/pr-helper.test.ts +284 -0
- package/src/git/pr-helper.ts +307 -0
- package/src/index.ts +66 -0
- package/src/keywords/ai-keyword-engine.ts +1062 -0
- package/src/keywords/ai-summarizer.ts +387 -0
- package/src/keywords/ci-mode.ts +555 -0
- package/src/keywords/engine.ts +359 -0
- package/src/keywords/index.ts +151 -0
- package/src/keywords/llm-judge.ts +357 -0
- package/src/keywords/nlp-analysis.ts +706 -0
- package/src/keywords/prioritizer.ts +295 -0
- package/src/keywords/site-crawler.ts +342 -0
- package/src/keywords/sources/autocomplete.ts +139 -0
- package/src/keywords/sources/competitive-search.ts +450 -0
- package/src/keywords/sources/competitor-analysis.ts +374 -0
- package/src/keywords/sources/dataforseo.ts +206 -0
- package/src/keywords/sources/free-sources.ts +294 -0
- package/src/keywords/sources/gsc.ts +123 -0
- package/src/keywords/topic-grouping.ts +327 -0
- package/src/keywords/types.ts +144 -0
- package/src/keywords/wizard.ts +457 -0
- package/src/loader.ts +40 -0
- package/src/reports/index.ts +7 -0
- package/src/reports/report-generator.test.ts +293 -0
- package/src/reports/report-generator.ts +713 -0
- package/src/scheduler/alerts.test.ts +458 -0
- package/src/scheduler/alerts.ts +328 -0
- package/src/scheduler/index.ts +8 -0
- package/src/scheduler/scheduled-audit.test.ts +377 -0
- package/src/scheduler/scheduled-audit.ts +149 -0
- package/src/test/integration-test.ts +325 -0
- package/src/tools/analyzer.ts +373 -0
- package/src/tools/crawl.ts +293 -0
- package/src/tools/files.ts +301 -0
- package/src/tools/h1-fixer.ts +249 -0
- package/src/tools/index.ts +67 -0
- package/src/tracking/github-action.ts +326 -0
- package/src/tracking/google-analytics.ts +265 -0
- package/src/tracking/index.ts +45 -0
- package/src/tracking/report-generator.ts +386 -0
- package/src/tracking/search-console.ts +335 -0
- package/src/types.ts +134 -0
- package/src/utils/http.ts +302 -0
- package/src/wasm-adapter.ts +297 -0
- package/src/wasm-entry.ts +14 -0
- package/tsconfig.json +17 -0
- package/tsup.wasm.config.ts +26 -0
- package/vitest.config.ts +15 -0
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
// Content Science Module - Novel SEO Checks Based on Academic Research
|
|
2
|
+
// Implements: Zipf's Law, Shannon Entropy, BM25 Scoring, Cosine Similarity
|
|
3
|
+
// These techniques differentiate RankCLI from competitors like Ahrefs/SEMrush
|
|
4
|
+
|
|
5
|
+
import * as cheerio from 'cheerio';
|
|
6
|
+
import type { AuditIssue } from '../types.js';
|
|
7
|
+
|
|
8
|
+
// ==================== ZIPF'S LAW ANALYSIS ====================
|
|
9
|
+
// Natural language follows Zipf's law: frequency ∝ 1/rank^α (α ≈ 1)
|
|
10
|
+
// Deviation from this indicates unnatural text (keyword stuffing)
|
|
11
|
+
|
|
12
|
+
/** Outcome of fitting a page's word-frequency curve to Zipf's law. */
export interface ZipfAnalysis {
  /** Fitted Zipf exponent α (negated log-log slope); natural text sits around 0.9-1.1. */
  alpha: number;
  /** True when the fit looks like ordinary prose (α within range and a good fit). */
  isNatural: boolean;
  /** Coefficient of determination of the log-log regression (goodness of fit). */
  rSquared: number;
  /** Words that occur noticeably more often than the fitted curve predicts; `deviation` is a percentage. */
  stuffedKeywords: { word: string; deviation: number }[];
  /** Mid-ranked words that occur noticeably less often than the fitted curve predicts. */
  missingMidTail: string[];
}
|
|
19
|
+
|
|
20
|
+
/**
 * Fits the page's word-frequency distribution to Zipf's law and reports
 * words that deviate from the fitted curve.
 *
 * The text of `<body>` (minus script, style, nav, footer and header elements)
 * is lowercased and split into runs of three or more ASCII letters. The 100
 * most frequent words are regressed as log(frequency) against log(rank); the
 * negated slope is the Zipf exponent α.
 *
 * @param html - Raw HTML of the page to analyse.
 * @returns The fitted exponent, fit quality and the outlier word lists.
 *   Pages with fewer than 100 tokens are not analysed and yield a neutral
 *   result (`alpha: 1`, `isNatural: true`). Pages whose tokens are all the
 *   same word cannot be fitted and yield `alpha: 0`, `isNatural: false`.
 */
export function analyzeZipfDistribution(html: string): ZipfAnalysis {
  const $ = cheerio.load(html);
  $('script, style, nav, footer, header').remove();
  const text = $('body').text().toLowerCase();

  // Tokens are runs of 3+ ASCII letters. This drops one- and two-letter words
  // only; longer function words ("the", "and", ...) are still counted.
  const words = text.match(/\b[a-z]{3,}\b/g) || [];
  if (words.length < 100) {
    return {
      alpha: 1,
      isNatural: true,
      rSquared: 0,
      stuffedKeywords: [],
      missingMidTail: [],
    };
  }

  // Count word frequencies
  const freq = new Map<string, number>();
  for (const word of words) {
    freq.set(word, (freq.get(word) || 0) + 1);
  }

  // Most frequent first, capped at the top 100 words
  const topWords = [...freq.entries()].sort((a, b) => b[1] - a[1]).slice(0, 100);

  // A line cannot be fitted through a single point: the regression would
  // divide by zero and leak NaN into `alpha`. One word repeated 100+ times is
  // reported as unnatural with a flat (α = 0) distribution instead.
  if (topWords.length < 2) {
    return {
      alpha: 0,
      isNatural: false,
      rSquared: 0,
      stuffedKeywords: [],
      missingMidTail: [],
    };
  }

  // Fit Zipf's law using log-log linear regression:
  // log(freq) = log(C) - α * log(rank)
  const logRanks = topWords.map((_, i) => Math.log(i + 1));
  const logFreqs = topWords.map(([, f]) => Math.log(f));

  const { slope, intercept, rSquared } = linearRegression(logRanks, logFreqs);
  const alpha = -slope; // α is negative of slope
  const expectedC = Math.exp(intercept);

  const stuffedKeywords: Array<{ word: string; deviation: number }> = [];
  const missingMidTail: string[] = [];

  topWords.forEach(([word, actualFreq], i) => {
    const expectedFreq = expectedC * Math.pow(i + 1, -alpha);
    const deviation = (actualFreq - expectedFreq) / expectedFreq;

    // Over-represented: >50% above the fitted curve. The six most frequent
    // words and words of four letters or fewer are never flagged.
    if (deviation > 0.5 && word.length > 4 && i > 5) {
      stuffedKeywords.push({ word, deviation: Math.round(deviation * 100) });
    }

    // Under-represented mid-tail: indices 20-49 (ranks 21-50) that fall more
    // than 30% below the fitted curve.
    if (i >= 20 && i < 50 && deviation < -0.3) {
      missingMidTail.push(word);
    }
  });

  return {
    alpha,
    isNatural: alpha >= 0.8 && alpha <= 1.2 && rSquared > 0.85,
    rSquared,
    stuffedKeywords: stuffedKeywords.slice(0, 10),
    missingMidTail: missingMidTail.slice(0, 5),
  };
}
|
|
96
|
+
|
|
97
|
+
// ==================== SHANNON ENTROPY ====================
|
|
98
|
+
// Information entropy measures vocabulary diversity
|
|
99
|
+
// Low entropy = repetitive content, High entropy = diverse vocabulary
|
|
100
|
+
|
|
101
|
+
/** Vocabulary-diversity metrics derived from Shannon entropy. */
export interface EntropyAnalysis {
  /** Shannon entropy (bits) of the single-word distribution. */
  wordEntropy: number;
  /** `wordEntropy` divided by its maximum for the observed vocabulary size; 0-1 scale. */
  normalizedEntropy: number;
  /** Type-token ratio: distinct words divided by total words. */
  vocabularyRichness: number;
  /** 0-1, higher means more repetitive (1 minus `normalizedEntropy`). */
  repetitionScore: number;
  /** Shannon entropy (bits) of the adjacent word-pair distribution. */
  bigramEntropy: number;
  /** Coarse quality bucket derived from the metrics above. */
  qualityIndicator: 'excellent' | 'good' | 'average' | 'poor';
}
|
|
109
|
+
|
|
110
|
+
/**
 * Measures vocabulary diversity of a page using Shannon entropy.
 *
 * Works on the lowercased `<body>` text (script, style, nav, footer and
 * header elements removed), tokenised into runs of two or more ASCII letters.
 *
 * @param html - Raw HTML of the page to analyse.
 * @returns Word/bigram entropy, type-token ratio and a quality bucket. Pages
 *   with fewer than 50 tokens are reported as `'poor'` with
 *   `repetitionScore: 1` and all other metrics 0.
 */
export function analyzeEntropy(html: string): EntropyAnalysis {
  const doc = cheerio.load(html);
  doc('script, style, nav, footer, header').remove();
  const tokens = doc('body').text().toLowerCase().match(/\b[a-z]{2,}\b/g) || [];

  if (tokens.length < 50) {
    return {
      wordEntropy: 0,
      normalizedEntropy: 0,
      vocabularyRichness: 0,
      repetitionScore: 1,
      bigramEntropy: 0,
      qualityIndicator: 'poor',
    };
  }

  // Tally single words and adjacent word pairs in one pass.
  const unigramCounts = new Map<string, number>();
  const bigramCounts = new Map<string, number>();
  tokens.forEach((token, idx) => {
    unigramCounts.set(token, (unigramCounts.get(token) || 0) + 1);
    if (idx + 1 < tokens.length) {
      const pair = `${token} ${tokens[idx + 1]}`;
      bigramCounts.set(pair, (bigramCounts.get(pair) || 0) + 1);
    }
  });

  // Shannon entropy: H = -Σ p(x) * log2(p(x))
  const wordEntropy = calculateShannonEntropy(unigramCounts, tokens.length);
  const bigramEntropy = calculateShannonEntropy(bigramCounts, tokens.length - 1);

  // The entropy ceiling is reached when every distinct word is equally likely.
  const entropyCeiling = Math.log2(unigramCounts.size);
  const normalizedEntropy = entropyCeiling > 0 ? wordEntropy / entropyCeiling : 0;

  const vocabularyRichness = unigramCounts.size / tokens.length;
  const repetitionScore = 1 - normalizedEntropy;

  // Tiers are checked best-first; both minimums must be exceeded.
  const tiers: Array<[number, number, EntropyAnalysis['qualityIndicator']]> = [
    [0.85, 0.4, 'excellent'],
    [0.75, 0.3, 'good'],
    [0.6, 0.2, 'average'],
  ];
  const matchedTier = tiers.find(
    ([minEntropy, minRichness]) => normalizedEntropy > minEntropy && vocabularyRichness > minRichness
  );
  const qualityIndicator: EntropyAnalysis['qualityIndicator'] = matchedTier ? matchedTier[2] : 'poor';

  return {
    wordEntropy,
    normalizedEntropy,
    vocabularyRichness,
    repetitionScore,
    bigramEntropy,
    qualityIndicator,
  };
}
|
|
178
|
+
|
|
179
|
+
/**
 * Computes Shannon entropy, H = -Σ p(x) * log2(p(x)), in bits.
 *
 * @param freq - Occurrence count per symbol.
 * @param total - Number of observations the counts are divided by to obtain
 *   probabilities.
 * @returns The entropy in bits, or 0 when `total` is not a positive number
 *   (no observations means there is no distribution to measure).
 */
function calculateShannonEntropy(freq: Map<string, number>, total: number): number {
  // Without this guard a zero total turns every probability into Infinity/NaN.
  if (!(total > 0)) {
    return 0;
  }

  let entropy = 0;
  for (const count of freq.values()) {
    const p = count / total;
    // Zero-probability symbols contribute nothing (and log2(0) is -Infinity).
    if (p > 0) {
      entropy -= p * Math.log2(p);
    }
  }
  return entropy;
}
|
|
189
|
+
|
|
190
|
+
// ==================== BM25 SCORING ====================
|
|
191
|
+
// Okapi BM25 is the ranking function used by Elasticsearch/Lucene
|
|
192
|
+
// More accurate than keyword density for relevance scoring
|
|
193
|
+
|
|
194
|
+
/** BM25 relevance result for one target keyword. */
export interface BM25Analysis {
  /** The keyword exactly as supplied by the caller. */
  keyword: string;
  /** BM25 score, rounded to two decimals. */
  score: number;
  /** Number of occurrences of the keyword found in the page text. */
  termFrequency: number;
  /** True when one more occurrence would raise the score by less than 10%. */
  saturationPoint: boolean;
  /** Length-normalisation term; positive when the page is longer than average. */
  lengthPenalty: number;
  /** Human-readable advice for this keyword. */
  recommendation: string;
}
|
|
202
|
+
|
|
203
|
+
/**
 * Scores how well a page covers each target keyword using a single-document
 * form of Okapi BM25 (the IDF factor is taken as 1).
 *
 * Keyword matching is case-insensitive. A keyword made of one purely
 * alphabetic word is counted as a whole token; phrases, and single keywords
 * containing other characters (e.g. "c++", "e-commerce"), are counted as
 * literal text matches where any run of whitespace matches any other.
 *
 * @param html - Raw HTML of the page; script and style elements are ignored.
 * @param keywords - Target keywords or phrases. Surrounding whitespace is ignored.
 * @param avgDocLength - Reference document length in words used for length
 *   normalisation. Non-positive values disable length normalisation.
 * @param k1 - BM25 term-frequency saturation parameter.
 * @param b - BM25 length-normalisation strength.
 * @returns One result per keyword, in input order.
 */
export function calculateBM25(
  html: string,
  keywords: string[],
  avgDocLength: number = 1500, // Average web page word count
  k1: number = 1.5,
  b: number = 0.75
): BM25Analysis[] {
  const $ = cheerio.load(html);
  $('script, style').remove();
  const text = $('body').text().toLowerCase();
  const words = text.match(/\b[a-z]+\b/g) || [];
  const docLength = words.length;

  // Everything that depends only on the document is computed once.
  // A non-positive average would divide by zero, so treat it as "same length".
  const lengthRatio = avgDocLength > 0 ? docLength / avgDocLength : 1;
  const lengthNorm = k1 * (1 - b + b * lengthRatio);
  const lengthPenalty = b * (lengthRatio - 1);

  // BM25 term score with IDF = 1.
  const scoreFor = (tf: number): number => {
    const denominator = tf + lengthNorm;
    return denominator > 0 ? (tf * (k1 + 1)) / denominator : 0;
  };

  // Keywords are user input: neutralise regex metacharacters so "c++" does not
  // throw and "node.js" does not treat "." as a wildcard.
  const escapeRegExp = (value: string): string => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const results: BM25Analysis[] = [];

  for (const keyword of keywords) {
    // Trim first so " seo " is one word, not a phrase with empty parts.
    const terms = keyword.toLowerCase().trim().split(/\s+/).filter(Boolean);
    const singleTerm = terms.length === 1 ? terms[0] : undefined;

    let tf: number;
    if (terms.length === 0) {
      tf = 0;
    } else if (singleTerm !== undefined && /^[a-z]+$/.test(singleTerm)) {
      // Plain word: exact token match
      tf = words.filter((w) => w === singleTerm).length;
    } else {
      // Phrase, or a word the [a-z]+ tokeniser would split: literal text match
      const pattern = new RegExp(terms.map(escapeRegExp).join('\\s+'), 'g');
      tf = (text.match(pattern) || []).length;
    }

    const score = scoreFor(tf);

    // Saturation check: would one more occurrence still move the score?
    const nextScore = scoreFor(tf + 1);
    const marginalGain = (nextScore - score) / (score || 1);
    const saturationPoint = marginalGain < 0.1; // Less than 10% improvement

    let recommendation: string;
    if (tf === 0) {
      recommendation = `Add "${keyword}" to your content`;
    } else if (saturationPoint) {
      recommendation = `Keyword "${keyword}" is saturated (${tf} occurrences). Focus on semantic variations.`;
    } else if (lengthPenalty > 0.3) {
      // Report the real excess over the average, not the b-weighted penalty.
      recommendation = `Content is ${Math.round((lengthRatio - 1) * 100)}% longer than average. Consider tightening.`;
    } else if (tf < 3) {
      recommendation = `Consider adding 1-2 more natural mentions of "${keyword}"`;
    } else {
      recommendation = `Good keyword presence for "${keyword}"`;
    }

    results.push({
      keyword,
      score: Math.round(score * 100) / 100,
      termFrequency: tf,
      saturationPoint,
      lengthPenalty: Math.round(lengthPenalty * 100) / 100,
      recommendation,
    });
  }

  return results;
}
|
|
277
|
+
|
|
278
|
+
// ==================== COSINE SIMILARITY ====================
|
|
279
|
+
// Better duplicate/cannibalization detection than exact matching
|
|
280
|
+
|
|
281
|
+
/** Result of comparing two pieces of content by cosine similarity. */
export interface SimilarityAnalysis {
  /** Cosine similarity of the two term vectors, 0-1, rounded to two decimals. */
  similarity: number;
  /** Terms that occur in both texts (truncated list). */
  sharedTerms: string[];
  /** Terms that occur only in the first text (truncated list). */
  uniqueToA: string[];
  /** Terms that occur only in the second text (truncated list). */
  uniqueToB: string[];
  /** Classification of the pair, from most to least similar. */
  recommendation: 'duplicate' | 'cannibalization' | 'related' | 'distinct';
}
|
|
288
|
+
|
|
289
|
+
/**
 * Builds a term-weight vector for a document over a given vocabulary.
 *
 * The text is lowercased and tokenised into runs of ASCII letters. Each
 * vector entry is the term's relative frequency in the document multiplied by
 * a constant IDF of ln(2); terms that do not occur get 0. Because only one
 * document is visible here the IDF is a fixed scale factor, not a
 * corpus statistic.
 *
 * @param text - Plain text of the document.
 * @param vocabulary - Terms defining the vector dimensions, in order. Terms
 *   are matched case-insensitively.
 * @returns One weight per vocabulary term, in vocabulary order.
 */
export function buildTFIDFVector(text: string, vocabulary: string[]): number[] {
  const words = text.toLowerCase().match(/\b[a-z]+\b/g) || [];
  const wordCount = new Map<string, number>();

  for (const word of words) {
    wordCount.set(word, (wordCount.get(word) || 0) + 1);
  }

  // Avoid dividing by zero for documents without any tokens.
  const docLength = words.length || 1;
  const idf = Math.log(2);

  return vocabulary.map((term) => {
    // Tokens were lowercased above, so the lookup key must be lowercased too;
    // otherwise a mixed-case vocabulary term could never match.
    const count = wordCount.get(term.toLowerCase()) || 0;
    return count > 0 ? (count / docLength) * idf : 0;
  });
}
|
|
312
|
+
|
|
313
|
+
/**
 * Compares two texts by the cosine of the angle between their term vectors.
 *
 * The vocabulary is the union of all words of three or more ASCII letters
 * found in either text (case-insensitive).
 *
 * @param textA - First text.
 * @param textB - Second text.
 * @returns The rounded similarity, up to 20 shared terms, up to 10 terms
 *   unique to each side, and a classification: above 0.95 `'duplicate'`,
 *   above 0.75 `'cannibalization'`, above 0.5 `'related'`, otherwise
 *   `'distinct'`. Thresholds are applied to the unrounded similarity.
 */
export function calculateCosineSimilarity(textA: string, textB: string): SimilarityAnalysis {
  const distinctTerms = (text: string): Set<string> =>
    new Set<string>(text.toLowerCase().match(/\b[a-z]{3,}\b/g) || []);

  const termsA = distinctTerms(textA);
  const termsB = distinctTerms(textB);

  // Shared dimension order for both vectors.
  const vocabulary = [...new Set([...termsA, ...termsB])];
  const weightsA = buildTFIDFVector(textA, vocabulary);
  const weightsB = buildTFIDFVector(textB, vocabulary);

  let dot = 0;
  let squaredA = 0;
  let squaredB = 0;
  vocabulary.forEach((_, idx) => {
    const a = weightsA[idx];
    const b = weightsB[idx];
    dot += a * b;
    squaredA += a * a;
    squaredB += b * b;
  });

  // A text with no vocabulary terms has a zero vector; define similarity as 0.
  const similarity = squaredA > 0 && squaredB > 0 ? dot / (Math.sqrt(squaredA) * Math.sqrt(squaredB)) : 0;

  const listA = [...termsA];
  const listB = [...termsB];

  const classify = (value: number): SimilarityAnalysis['recommendation'] => {
    if (value > 0.95) return 'duplicate';
    if (value > 0.75) return 'cannibalization';
    if (value > 0.5) return 'related';
    return 'distinct';
  };

  return {
    similarity: Math.round(similarity * 100) / 100,
    sharedTerms: listA.filter((term) => termsB.has(term)).slice(0, 20),
    uniqueToA: listA.filter((term) => !termsB.has(term)).slice(0, 10),
    uniqueToB: listB.filter((term) => !termsA.has(term)).slice(0, 10),
    recommendation: classify(similarity),
  };
}
|
|
365
|
+
|
|
366
|
+
// ==================== INFORMATION GAIN ====================
|
|
367
|
+
// Identifies keywords that differentiate top-ranking content
|
|
368
|
+
|
|
369
|
+
/** Information-gain estimate for one topic keyword. */
export interface InformationGainResult {
  /** The keyword exactly as supplied by the caller. */
  keyword: string;
  /** Estimated information gain, rounded to two decimals. */
  informationGain: number;
  /** Whether the keyword was found in the analysed page text. */
  presentInTopContent: boolean;
  /** How urgently the keyword should be addressed. */
  priority: 'critical' | 'high' | 'medium' | 'low';
}
|
|
375
|
+
|
|
376
|
+
/**
 * Estimates how much each topic keyword would add to the page (simplified,
 * single-document heuristic — not a corpus-based information-gain measure).
 *
 * A keyword counts as present when it occurs anywhere in the lowercased page
 * text; runs of whitespace in both the page and the keyword are treated as a
 * single space, so a phrase split across elements or lines is still found.
 * Phrases get a base gain of 0.5 and single words 0.3; keywords already
 * present are discounted to 80% of the base.
 *
 * @param html - Raw HTML of the page; script and style elements are ignored.
 * @param topicKeywords - Keywords or phrases expected for the topic.
 *   Surrounding whitespace is ignored.
 * @returns One result per keyword, sorted by information gain, highest first.
 */
export function calculateInformationGain(
  html: string,
  topicKeywords: string[]
): InformationGainResult[] {
  const $ = cheerio.load(html);
  $('script, style').remove();
  // Extracted DOM text keeps the markup's newlines and indentation; collapse
  // them so multi-word keywords can be matched with a plain substring test.
  const text = $('body').text().toLowerCase().replace(/\s+/g, ' ');

  const results: InformationGainResult[] = [];

  for (const keyword of topicKeywords) {
    // Normalise the keyword the same way as the text; trimming also stops
    // " seo " from being counted as a multi-word phrase.
    const terms = keyword.toLowerCase().trim().split(/\s+/).filter(Boolean);
    const isPresent = text.includes(terms.join(' '));

    // Phrases are more specific than single words
    const baseIG = terms.length > 1 ? 0.5 : 0.3;
    const informationGain = isPresent ? baseIG * 0.8 : baseIG;

    // NOTE(review): a missing keyword scores exactly 0.5 or 0.3, so the
    // `> 0.3` tier ('high') is never selected with the current base values —
    // missing single words fall through to 'medium'. Confirm the intended
    // thresholds before changing them.
    let priority: InformationGainResult['priority'];
    if (!isPresent && informationGain > 0.4) {
      priority = 'critical';
    } else if (!isPresent && informationGain > 0.3) {
      priority = 'high';
    } else if (!isPresent) {
      priority = 'medium';
    } else {
      priority = 'low';
    }

    results.push({
      keyword,
      informationGain: Math.round(informationGain * 100) / 100,
      presentInTopContent: isPresent,
      priority,
    });
  }

  return results.sort((a, b) => b.informationGain - a.informationGain);
}
|
|
422
|
+
|
|
423
|
+
// ==================== HELPER FUNCTIONS ====================
|
|
424
|
+
|
|
425
|
+
/**
 * Ordinary least-squares fit of y = slope * x + intercept.
 *
 * @param x - Independent values.
 * @param y - Dependent values, paired with `x` by index. If the arrays differ
 *   in length only the common prefix is used.
 * @returns The fitted slope and intercept plus the coefficient of
 *   determination. When no line can be determined — no points, or every x
 *   identical — the result is a flat line through the mean of y
 *   (`slope: 0`, `rSquared: 0`) instead of NaN.
 */
function linearRegression(x: number[], y: number[]): { slope: number; intercept: number; rSquared: number } {
  const n = Math.min(x.length, y.length);
  if (n === 0) {
    return { slope: 0, intercept: 0, rSquared: 0 };
  }

  let sumX = 0,
    sumY = 0,
    sumXY = 0,
    sumXX = 0;

  for (let i = 0; i < n; i++) {
    sumX += x[i];
    sumY += y[i];
    sumXY += x[i] * y[i];
    sumXX += x[i] * x[i];
  }

  const yMean = sumY / n;

  // n * Σx² - (Σx)² is n² times the variance of x; zero means all x are equal
  // (always the case for a single point) and the slope is undefined.
  const denominator = n * sumXX - sumX * sumX;
  if (denominator === 0) {
    return { slope: 0, intercept: yMean, rSquared: 0 };
  }

  const slope = (n * sumXY - sumX * sumY) / denominator;
  const intercept = (sumY - slope * sumX) / n;

  // R-squared = 1 - (residual sum of squares / total sum of squares)
  let ssRes = 0,
    ssTot = 0;
  for (let i = 0; i < n; i++) {
    const predicted = slope * x[i] + intercept;
    ssRes += (y[i] - predicted) ** 2;
    ssTot += (y[i] - yMean) ** 2;
  }
  // Constant y has no variance to explain; report 0 rather than dividing by zero.
  const rSquared = ssTot > 0 ? 1 - ssRes / ssTot : 0;

  return { slope, intercept, rSquared };
}
|
|
457
|
+
|
|
458
|
+
// ==================== MAIN ANALYSIS FUNCTION ====================
|
|
459
|
+
|
|
460
|
+
/** Combined output of the individual content-science analyses for one page. */
export interface ContentScienceData {
  /** Zipf's-law fit of the word-frequency distribution. */
  zipf: ZipfAnalysis;
  /** Vocabulary-diversity metrics. */
  entropy: EntropyAnalysis;
  /** Per-keyword BM25 results; empty when no target keywords were analysed. */
  bm25: BM25Analysis[];
  /** Overall verdict derived from the analyses above. */
  overallQuality: 'excellent' | 'good' | 'needs-work' | 'poor';
}
|
|
466
|
+
|
|
467
|
+
/**
 * Runs the Zipf, entropy and (optionally) BM25 analyses on a page and turns
 * the findings into audit issues plus an overall quality verdict.
 *
 * The quality score is out of 100: up to 30 points for a natural Zipf
 * distribution, up to 40 for the entropy quality bucket, and up to 30 for the
 * share of target keywords that are present but not saturated. When no target
 * keywords are supplied the keyword component cannot be evaluated, so the
 * remaining 70 points are rescaled to 100; otherwise a keyword-less analysis
 * could never be rated `'excellent'`.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL of the page, recorded on every issue raised.
 * @param targetKeywords - Keywords the page is meant to rank for. BM25
 *   analysis is skipped when empty.
 * @returns The issues found and the raw analysis data.
 */
export function analyzeContentScience(
  html: string,
  url: string,
  targetKeywords: string[] = []
): { issues: AuditIssue[]; data: ContentScienceData } {
  const issues: AuditIssue[] = [];

  // Zipf's Law Analysis
  const zipf = analyzeZipfDistribution(html);

  if (!zipf.isNatural && zipf.stuffedKeywords.length > 0) {
    issues.push({
      code: 'ZIPF_KEYWORD_STUFFING',
      severity: 'warning',
      category: 'content',
      title: 'Unnatural keyword distribution detected',
      description: `Content deviates from natural language patterns (Zipf α=${zipf.alpha.toFixed(2)}, expected ~1.0). Possible keyword stuffing detected.`,
      impact: 'Search engines can detect unnatural text patterns, potentially triggering spam filters.',
      howToFix: `Reduce repetition of: ${zipf.stuffedKeywords.map((k) => `"${k.word}" (+${k.deviation}%)`).join(', ')}`,
      affectedUrls: [url],
      details: {
        alpha: zipf.alpha,
        rSquared: zipf.rSquared,
        stuffedKeywords: zipf.stuffedKeywords,
      },
    });
  }

  // Entropy Analysis
  const entropy = analyzeEntropy(html);

  if (entropy.qualityIndicator === 'poor') {
    issues.push({
      code: 'LOW_CONTENT_ENTROPY',
      severity: 'warning',
      category: 'content',
      title: 'Low vocabulary diversity (thin content signal)',
      description: `Content entropy is low (${entropy.normalizedEntropy.toFixed(2)}). This indicates repetitive or thin content.`,
      impact: 'Low diversity content may be seen as low-quality by search engines.',
      howToFix: 'Expand vocabulary, add more unique insights, and reduce repetitive phrases.',
      affectedUrls: [url],
      details: {
        normalizedEntropy: entropy.normalizedEntropy,
        vocabularyRichness: entropy.vocabularyRichness,
        repetitionScore: entropy.repetitionScore,
      },
    });
  }

  if (entropy.repetitionScore > 0.7) {
    issues.push({
      code: 'HIGH_CONTENT_REPETITION',
      severity: 'notice',
      category: 'content',
      title: 'High content repetition detected',
      description: `Content shows ${Math.round(entropy.repetitionScore * 100)}% repetition pattern.`,
      impact: 'Highly repetitive content provides less value and may hurt engagement.',
      howToFix: 'Vary your vocabulary and sentence structures. Avoid repeating the same phrases.',
      affectedUrls: [url],
    });
  }

  // BM25 Analysis (if keywords provided)
  const bm25 = targetKeywords.length > 0 ? calculateBM25(html, targetKeywords) : [];

  for (const result of bm25) {
    if (result.termFrequency === 0) {
      issues.push({
        code: 'BM25_KEYWORD_MISSING',
        severity: 'warning',
        category: 'on-page',
        title: `Target keyword "${result.keyword}" not found`,
        description: 'This keyword is not present in your content at all.',
        impact: 'Page unlikely to rank for this keyword without any mention of it.',
        howToFix: `Add natural mentions of "${result.keyword}" in your content.`,
        affectedUrls: [url],
        details: { keyword: result.keyword, bm25Score: 0 },
      });
    } else if (result.saturationPoint && result.termFrequency > 10) {
      // Saturation alone is common at low counts; only flag heavy repetition.
      issues.push({
        code: 'BM25_KEYWORD_SATURATED',
        severity: 'notice',
        category: 'on-page',
        title: `Keyword "${result.keyword}" over-optimized`,
        description: `Found ${result.termFrequency} times. Additional mentions provide diminishing returns.`,
        impact: 'Over-optimization can trigger spam signals.',
        howToFix: 'Use semantic variations and related terms instead of exact repetition.',
        affectedUrls: [url],
        details: { keyword: result.keyword, termFrequency: result.termFrequency },
      });
    }
  }

  // Overall quality assessment
  const zipfPoints = zipf.isNatural ? 30 : 0;
  const entropyPoints =
    entropy.qualityIndicator === 'excellent'
      ? 40
      : entropy.qualityIndicator === 'good'
        ? 30
        : entropy.qualityIndicator === 'average'
          ? 20
          : 0;

  let qualityScore: number;
  if (bm25.length > 0) {
    const healthyKeywords = bm25.filter((b) => b.termFrequency > 0 && !b.saturationPoint).length;
    qualityScore = zipfPoints + entropyPoints + (healthyKeywords / bm25.length) * 30;
  } else {
    // No keywords were analysed: score only what was measured, scaled to 100.
    qualityScore = ((zipfPoints + entropyPoints) / 70) * 100;
  }

  let overallQuality: ContentScienceData['overallQuality'];
  if (qualityScore >= 80) {
    overallQuality = 'excellent';
  } else if (qualityScore >= 60) {
    overallQuality = 'good';
  } else if (qualityScore >= 40) {
    overallQuality = 'needs-work';
  } else {
    overallQuality = 'poor';
  }

  return {
    issues,
    data: {
      zipf,
      entropy,
      bm25,
      overallQuality,
    },
  };
}
|