@rankcli/agent-runtime 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/analyzer-2CSWIQGD.mjs +6 -0
- package/dist/chunk-YNZYHEYM.mjs +774 -0
- package/dist/index.d.mts +4012 -0
- package/dist/index.d.ts +4012 -0
- package/dist/index.js +29672 -0
- package/dist/index.mjs +28602 -0
- package/package.json +53 -0
- package/scripts/build-deno.ts +134 -0
- package/src/audit/ai/analyzer.ts +347 -0
- package/src/audit/ai/index.ts +29 -0
- package/src/audit/ai/prompts/content-analysis.ts +271 -0
- package/src/audit/ai/types.ts +179 -0
- package/src/audit/checks/additional-checks.ts +439 -0
- package/src/audit/checks/ai-citation-worthiness.ts +399 -0
- package/src/audit/checks/ai-content-structure.ts +325 -0
- package/src/audit/checks/ai-readiness.ts +339 -0
- package/src/audit/checks/anchor-text.ts +179 -0
- package/src/audit/checks/answer-conciseness.ts +322 -0
- package/src/audit/checks/asset-minification.ts +270 -0
- package/src/audit/checks/bing-optimization.ts +206 -0
- package/src/audit/checks/brand-mention-optimization.ts +349 -0
- package/src/audit/checks/caching-headers.ts +305 -0
- package/src/audit/checks/canonical-advanced.ts +150 -0
- package/src/audit/checks/canonical-domain.ts +196 -0
- package/src/audit/checks/citation-quality.ts +358 -0
- package/src/audit/checks/client-rendering.ts +542 -0
- package/src/audit/checks/color-contrast.ts +342 -0
- package/src/audit/checks/content-freshness.ts +170 -0
- package/src/audit/checks/content-science.ts +589 -0
- package/src/audit/checks/conversion-elements.ts +526 -0
- package/src/audit/checks/crawlability.ts +220 -0
- package/src/audit/checks/directory-listing.ts +172 -0
- package/src/audit/checks/dom-analysis.ts +191 -0
- package/src/audit/checks/dom-size.ts +246 -0
- package/src/audit/checks/duplicate-content.ts +194 -0
- package/src/audit/checks/eeat-signals.ts +990 -0
- package/src/audit/checks/entity-seo.ts +396 -0
- package/src/audit/checks/featured-snippet.ts +473 -0
- package/src/audit/checks/freshness-signals.ts +443 -0
- package/src/audit/checks/funnel-intent.ts +463 -0
- package/src/audit/checks/hreflang.ts +174 -0
- package/src/audit/checks/html-compliance.ts +302 -0
- package/src/audit/checks/image-dimensions.ts +167 -0
- package/src/audit/checks/images.ts +160 -0
- package/src/audit/checks/indexnow.ts +275 -0
- package/src/audit/checks/interactive-tools.ts +475 -0
- package/src/audit/checks/internal-link-graph.ts +436 -0
- package/src/audit/checks/keyword-analysis.ts +239 -0
- package/src/audit/checks/keyword-cannibalization.ts +385 -0
- package/src/audit/checks/keyword-placement.ts +471 -0
- package/src/audit/checks/links.ts +203 -0
- package/src/audit/checks/llms-txt.ts +224 -0
- package/src/audit/checks/local-seo.ts +296 -0
- package/src/audit/checks/mobile.ts +167 -0
- package/src/audit/checks/modern-images.ts +226 -0
- package/src/audit/checks/navboost-signals.ts +395 -0
- package/src/audit/checks/on-page.ts +209 -0
- package/src/audit/checks/page-resources.ts +285 -0
- package/src/audit/checks/pagination.ts +180 -0
- package/src/audit/checks/performance.ts +153 -0
- package/src/audit/checks/platform-presence.ts +580 -0
- package/src/audit/checks/redirect-analysis.ts +153 -0
- package/src/audit/checks/redirect-chain.ts +389 -0
- package/src/audit/checks/resource-hints.ts +420 -0
- package/src/audit/checks/responsive-css.ts +247 -0
- package/src/audit/checks/responsive-images.ts +396 -0
- package/src/audit/checks/review-ecosystem.ts +415 -0
- package/src/audit/checks/robots-validation.ts +373 -0
- package/src/audit/checks/security-headers.ts +172 -0
- package/src/audit/checks/security.ts +144 -0
- package/src/audit/checks/serp-preview.ts +251 -0
- package/src/audit/checks/site-maturity.ts +444 -0
- package/src/audit/checks/social-meta.test.ts +275 -0
- package/src/audit/checks/social-meta.ts +134 -0
- package/src/audit/checks/soft-404.ts +151 -0
- package/src/audit/checks/structured-data.ts +238 -0
- package/src/audit/checks/tech-detection.ts +496 -0
- package/src/audit/checks/topical-clusters.ts +435 -0
- package/src/audit/checks/tracker-bloat.ts +462 -0
- package/src/audit/checks/tracking-verification.test.ts +371 -0
- package/src/audit/checks/tracking-verification.ts +636 -0
- package/src/audit/checks/url-safety.ts +682 -0
- package/src/audit/deno-entry.ts +66 -0
- package/src/audit/discovery/index.ts +15 -0
- package/src/audit/discovery/link-crawler.ts +232 -0
- package/src/audit/discovery/repo-routes.ts +347 -0
- package/src/audit/engine.ts +620 -0
- package/src/audit/fixes/index.ts +209 -0
- package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
- package/src/audit/fixes/social-meta-fixes.ts +463 -0
- package/src/audit/index.ts +74 -0
- package/src/audit/runner.test.ts +299 -0
- package/src/audit/runner.ts +130 -0
- package/src/audit/types.ts +1953 -0
- package/src/content/featured-snippet.ts +367 -0
- package/src/content/generator.test.ts +534 -0
- package/src/content/generator.ts +501 -0
- package/src/content/headline.ts +317 -0
- package/src/content/index.ts +62 -0
- package/src/content/intent.ts +258 -0
- package/src/content/keyword-density.ts +349 -0
- package/src/content/readability.ts +262 -0
- package/src/executor.ts +336 -0
- package/src/fixer.ts +416 -0
- package/src/frameworks/detector.test.ts +248 -0
- package/src/frameworks/detector.ts +371 -0
- package/src/frameworks/index.ts +68 -0
- package/src/frameworks/recipes/angular.yaml +171 -0
- package/src/frameworks/recipes/astro.yaml +206 -0
- package/src/frameworks/recipes/django.yaml +180 -0
- package/src/frameworks/recipes/laravel.yaml +137 -0
- package/src/frameworks/recipes/nextjs.yaml +268 -0
- package/src/frameworks/recipes/nuxt.yaml +175 -0
- package/src/frameworks/recipes/rails.yaml +188 -0
- package/src/frameworks/recipes/react.yaml +202 -0
- package/src/frameworks/recipes/sveltekit.yaml +154 -0
- package/src/frameworks/recipes/vue.yaml +137 -0
- package/src/frameworks/recipes/wordpress.yaml +209 -0
- package/src/frameworks/suggestion-engine.ts +320 -0
- package/src/geo/geo-content.test.ts +305 -0
- package/src/geo/geo-content.ts +266 -0
- package/src/geo/geo-history.test.ts +473 -0
- package/src/geo/geo-history.ts +433 -0
- package/src/geo/geo-tracker.test.ts +359 -0
- package/src/geo/geo-tracker.ts +411 -0
- package/src/geo/index.ts +10 -0
- package/src/git/commit-helper.test.ts +261 -0
- package/src/git/commit-helper.ts +329 -0
- package/src/git/index.ts +12 -0
- package/src/git/pr-helper.test.ts +284 -0
- package/src/git/pr-helper.ts +307 -0
- package/src/index.ts +66 -0
- package/src/keywords/ai-keyword-engine.ts +1062 -0
- package/src/keywords/ai-summarizer.ts +387 -0
- package/src/keywords/ci-mode.ts +555 -0
- package/src/keywords/engine.ts +359 -0
- package/src/keywords/index.ts +151 -0
- package/src/keywords/llm-judge.ts +357 -0
- package/src/keywords/nlp-analysis.ts +706 -0
- package/src/keywords/prioritizer.ts +295 -0
- package/src/keywords/site-crawler.ts +342 -0
- package/src/keywords/sources/autocomplete.ts +139 -0
- package/src/keywords/sources/competitive-search.ts +450 -0
- package/src/keywords/sources/competitor-analysis.ts +374 -0
- package/src/keywords/sources/dataforseo.ts +206 -0
- package/src/keywords/sources/free-sources.ts +294 -0
- package/src/keywords/sources/gsc.ts +123 -0
- package/src/keywords/topic-grouping.ts +327 -0
- package/src/keywords/types.ts +144 -0
- package/src/keywords/wizard.ts +457 -0
- package/src/loader.ts +40 -0
- package/src/reports/index.ts +7 -0
- package/src/reports/report-generator.test.ts +293 -0
- package/src/reports/report-generator.ts +713 -0
- package/src/scheduler/alerts.test.ts +458 -0
- package/src/scheduler/alerts.ts +328 -0
- package/src/scheduler/index.ts +8 -0
- package/src/scheduler/scheduled-audit.test.ts +377 -0
- package/src/scheduler/scheduled-audit.ts +149 -0
- package/src/test/integration-test.ts +325 -0
- package/src/tools/analyzer.ts +373 -0
- package/src/tools/crawl.ts +293 -0
- package/src/tools/files.ts +301 -0
- package/src/tools/h1-fixer.ts +249 -0
- package/src/tools/index.ts +67 -0
- package/src/tracking/github-action.ts +326 -0
- package/src/tracking/google-analytics.ts +265 -0
- package/src/tracking/index.ts +45 -0
- package/src/tracking/report-generator.ts +386 -0
- package/src/tracking/search-console.ts +335 -0
- package/src/types.ts +134 -0
- package/src/utils/http.ts +302 -0
- package/src/wasm-adapter.ts +297 -0
- package/src/wasm-entry.ts +14 -0
- package/tsconfig.json +17 -0
- package/tsup.wasm.config.ts +26 -0
- package/vitest.config.ts +15 -0
package/src/audit/checks/keyword-cannibalization.ts
@@ -0,0 +1,385 @@
+// Keyword Cannibalization Detection
+// Based on Nathan Gotch's SEO technique: Detecting when multiple pages target the same keyword
+// This causes pages to compete against each other in search results
+
+import * as cheerio from 'cheerio';
+import type { AuditIssue } from '../types.js';
+
+export interface PageKeywordData {
+  url: string;
+  title: string;
+  h1: string[];
+  metaDescription: string;
+  primaryKeywords: string[];
+  keywordDensity: Map<string, number>;
+}
+
+export interface CannibalizationIssue {
+  keyword: string;
+  pages: string[];
+  similarity: number;
+}
+
+export interface KeywordCannibalizationData {
+  pagesAnalyzed: number;
+  potentialIssues: CannibalizationIssue[];
+  pageKeywords: PageKeywordData[];
+}
+
+/**
+ * Extract primary keywords from a page
+ */
+export function extractPageKeywords(html: string, url: string): PageKeywordData {
+  const $ = cheerio.load(html);
+
+  // Get title
+  const title = $('title').text().trim() || '';
+
+  // Get H1s
+  const h1s: string[] = [];
+  $('h1').each((_, el) => {
+    h1s.push($(el).text().trim());
+  });
+
+  // Get meta description
+  const metaDescription = $('meta[name="description"]').attr('content')?.trim() || '';
+
+  // Extract text content
+  $('script, style, nav, footer, header').remove();
+  const bodyText = $('body').text().toLowerCase();
+
+  // Tokenize and count words
+  const words = bodyText
+    .replace(/[^a-z0-9\s]/g, ' ')
+    .split(/\s+/)
+    .filter((w) => w.length > 3);
+
+  // Count word frequency
+  const wordCount = new Map<string, number>();
+  for (const word of words) {
+    wordCount.set(word, (wordCount.get(word) || 0) + 1);
+  }
+
+  // Calculate keyword density
+  const totalWords = words.length;
+  const keywordDensity = new Map<string, number>();
+  for (const [word, count] of wordCount) {
+    keywordDensity.set(word, count / totalWords);
+  }
+
+  // Extract primary keywords from title and H1 (most important signals)
+  const titleWords = title
+    .toLowerCase()
+    .replace(/[^a-z0-9\s]/g, ' ')
+    .split(/\s+/)
+    .filter((w) => w.length > 3);
+
+  const h1Words = h1s
+    .join(' ')
+    .toLowerCase()
+    .replace(/[^a-z0-9\s]/g, ' ')
+    .split(/\s+/)
+    .filter((w) => w.length > 3);
+
+  // Primary keywords are those appearing in both title and H1, or high density
+  const primaryKeywords: string[] = [];
+  const seen = new Set<string>();
+
+  // Words in both title and H1 are likely primary keywords
+  for (const word of titleWords) {
+    if (h1Words.includes(word) && !seen.has(word)) {
+      primaryKeywords.push(word);
+      seen.add(word);
+    }
+  }
+
+  // Also include high-density words from title
+  for (const word of titleWords) {
+    const density = keywordDensity.get(word) || 0;
+    if (density > 0.01 && !seen.has(word)) {
+      primaryKeywords.push(word);
+      seen.add(word);
+    }
+  }
+
+  return {
+    url,
+    title,
+    h1: h1s,
+    metaDescription,
+    primaryKeywords,
+    keywordDensity,
+  };
+}
+
+/**
+ * Extract 2-word and 3-word phrases (n-grams) for better cannibalization detection
+ */
+export function extractKeyPhrases(html: string): string[] {
+  const $ = cheerio.load(html);
+
+  const title = $('title').text().trim().toLowerCase();
+  const h1s = $('h1')
+    .map((_, el) => $(el).text().trim().toLowerCase())
+    .get()
+    .join(' ');
+  const metaDesc = ($('meta[name="description"]').attr('content') || '').toLowerCase();
+
+  const combinedText = `${title} ${h1s} ${metaDesc}`;
+
+  // Extract 2-word and 3-word phrases
+  const words = combinedText.replace(/[^a-z0-9\s]/g, ' ').split(/\s+/).filter(Boolean);
+  const phrases: string[] = [];
+
+  // 2-word phrases
+  for (let i = 0; i < words.length - 1; i++) {
+    if (words[i].length > 2 && words[i + 1].length > 2) {
+      phrases.push(`${words[i]} ${words[i + 1]}`);
+    }
+  }
+
+  // 3-word phrases
+  for (let i = 0; i < words.length - 2; i++) {
+    if (words[i].length > 2 && words[i + 1].length > 2 && words[i + 2].length > 2) {
+      phrases.push(`${words[i]} ${words[i + 1]} ${words[i + 2]}`);
+    }
+  }
+
+  return phrases;
+}
+
+/**
+ * Calculate Jaccard similarity between two sets of keywords
+ */
+export function calculateKeywordSimilarity(keywords1: string[], keywords2: string[]): number {
+  if (keywords1.length === 0 || keywords2.length === 0) return 0;
+
+  const set1 = new Set(keywords1);
+  const set2 = new Set(keywords2);
+
+  const intersection = new Set([...set1].filter((x) => set2.has(x)));
+  const union = new Set([...set1, ...set2]);
+
+  return intersection.size / union.size;
+}
+
+/**
+ * Detect potential keyword cannibalization between pages
+ * This is used when crawling multiple pages
+ */
+export function detectCannibalization(pages: PageKeywordData[]): CannibalizationIssue[] {
+  const issues: CannibalizationIssue[] = [];
+  const keywordToPages = new Map<string, string[]>();
+
+  // Build a map of keywords to pages
+  for (const page of pages) {
+    for (const keyword of page.primaryKeywords) {
+      const existing = keywordToPages.get(keyword) || [];
+      existing.push(page.url);
+      keywordToPages.set(keyword, existing);
+    }
+  }
+
+  // Find keywords targeting multiple pages
+  for (const [keyword, urls] of keywordToPages) {
+    if (urls.length > 1) {
+      // Calculate similarity between all pairs
+      let maxSimilarity = 0;
+
+      for (let i = 0; i < urls.length; i++) {
+        for (let j = i + 1; j < urls.length; j++) {
+          const page1 = pages.find((p) => p.url === urls[i]);
+          const page2 = pages.find((p) => p.url === urls[j]);
+
+          if (page1 && page2) {
+            const similarity = calculateKeywordSimilarity(page1.primaryKeywords, page2.primaryKeywords);
+            maxSimilarity = Math.max(maxSimilarity, similarity);
+          }
+        }
+      }
+
+      // Only flag if similarity is above threshold (0.5 = 50% overlap)
+      if (maxSimilarity > 0.5) {
+        issues.push({
+          keyword,
+          pages: urls,
+          similarity: maxSimilarity,
+        });
+      }
+    }
+  }
+
+  return issues;
+}
+
+/**
+ * Analyze a single page for cannibalization signals
+ * Used for single-page audits to detect potential self-cannibalization patterns
+ */
+export function analyzeKeywordCannibalizationSinglePage(
+  html: string,
+  url: string
+): { issues: AuditIssue[]; data: PageKeywordData } {
+  const issues: AuditIssue[] = [];
+  const pageData = extractPageKeywords(html, url);
+
+  // Check for duplicate/similar H1 and title (common cannibalization signal)
+  if (pageData.h1.length > 1) {
+    const h1Texts = pageData.h1.map((h) => h.toLowerCase().trim());
+    const uniqueH1s = new Set(h1Texts);
+
+    if (uniqueH1s.size < h1Texts.length) {
+      issues.push({
+        code: 'INTERNAL_CANNIBALIZATION_SIGNAL',
+        severity: 'warning',
+        category: 'content',
+        title: 'Duplicate H1 tags on page',
+        description: `Page has ${h1Texts.length} H1 tags with duplicates, which can dilute keyword focus.`,
+        impact: 'Multiple similar H1s can confuse search engines about page topic.',
+        howToFix: 'Use a single, unique H1 that clearly defines the page topic.',
+        affectedUrls: [url],
+        details: { h1s: pageData.h1 },
+      });
+    }
+  }
+
+  // Check if title and H1 are targeting completely different keywords
+  const titleKeywords = new Set(
+    pageData.title
+      .toLowerCase()
+      .replace(/[^a-z0-9\s]/g, ' ')
+      .split(/\s+/)
+      .filter((w) => w.length > 3)
+  );
+
+  const h1Keywords = new Set(
+    pageData.h1
+      .join(' ')
+      .toLowerCase()
+      .replace(/[^a-z0-9\s]/g, ' ')
+      .split(/\s+/)
+      .filter((w) => w.length > 3)
+  );
+
+  // Calculate overlap
+  const intersection = new Set([...titleKeywords].filter((x) => h1Keywords.has(x)));
+
+  if (titleKeywords.size > 0 && h1Keywords.size > 0 && intersection.size === 0) {
+    issues.push({
+      code: 'TITLE_H1_KEYWORD_MISMATCH',
+      severity: 'warning',
+      category: 'on-page',
+      title: 'Title and H1 target different keywords',
+      description: 'The title tag and H1 heading have no overlapping keywords.',
+      impact: 'Search engines may be confused about the primary topic of the page.',
+      howToFix: 'Align title and H1 to target the same primary keyword or topic.',
+      affectedUrls: [url],
+      details: {
+        titleKeywords: [...titleKeywords],
+        h1Keywords: [...h1Keywords],
+      },
+    });
+  }
+
+  return { issues, data: pageData };
+}
+
+/**
+ * Full cannibalization analysis for multi-page crawls
+ */
+export function analyzeKeywordCannibalization(
+  pagesHtml: Map<string, string>
+): { issues: AuditIssue[]; data: KeywordCannibalizationData } {
+  const issues: AuditIssue[] = [];
+  const pageKeywords: PageKeywordData[] = [];
+
+  // Extract keywords from all pages
+  for (const [url, html] of pagesHtml) {
+    const { data } = analyzeKeywordCannibalizationSinglePage(html, url);
+    pageKeywords.push(data);
+  }
+
+  // Detect cannibalization issues
+  const cannibalizationIssues = detectCannibalization(pageKeywords);
+
+  // Create audit issues for each cannibalization problem
+  for (const issue of cannibalizationIssues) {
+    issues.push({
+      code: 'KEYWORD_CANNIBALIZATION',
+      severity: 'warning',
+      category: 'content',
+      title: `Keyword cannibalization: "${issue.keyword}"`,
+      description: `Multiple pages (${issue.pages.length}) are targeting the same keyword "${issue.keyword}" with ${Math.round(issue.similarity * 100)}% keyword overlap.`,
+      impact: 'These pages compete against each other in search results, potentially hurting all their rankings.',
+      howToFix:
+        'Consolidate pages into one comprehensive page, differentiate keyword targeting, or use canonical tags to indicate the preferred version.',
+      affectedUrls: issue.pages,
+      details: {
+        keyword: issue.keyword,
+        similarity: issue.similarity,
+        pageCount: issue.pages.length,
+      },
+    });
+  }
+
+  return {
+    issues,
+    data: {
+      pagesAnalyzed: pagesHtml.size,
+      potentialIssues: cannibalizationIssues,
+      pageKeywords,
+    },
+  };
+}
+
+/**
+ * Check for URL-level cannibalization signals
+ * (e.g., /chicago-lawyer and /chicago-attorney targeting same intent)
+ */
+export function detectUrlCannibalizationPatterns(urls: string[]): string[][] {
+  const patterns: string[][] = [];
+  const synonymPairs = [
+    ['lawyer', 'attorney'],
+    ['best', 'top'],
+    ['guide', 'tutorial', 'how-to'],
+    ['review', 'reviews'],
+    ['buy', 'purchase', 'shop'],
+    ['cheap', 'affordable', 'budget'],
+    ['vs', 'versus', 'compared'],
+  ];
+
+  // Group URLs by base path (ignoring synonyms)
+  const urlGroups = new Map<string, string[]>();
+
+  for (const url of urls) {
+    try {
+      const parsed = new URL(url);
+      let normalizedPath = parsed.pathname.toLowerCase();
+
+      // Replace synonyms with canonical version
+      for (const synonymSet of synonymPairs) {
+        for (let i = 1; i < synonymSet.length; i++) {
+          normalizedPath = normalizedPath.replace(
+            new RegExp(`\\b${synonymSet[i]}\\b`, 'g'),
+            synonymSet[0]
+          );
+        }
+      }
+
+      const existing = urlGroups.get(normalizedPath) || [];
+      existing.push(url);
+      urlGroups.set(normalizedPath, existing);
+    } catch {
+      // Invalid URL, skip
+    }
+  }
+
+  // Find groups with multiple URLs (potential cannibalization)
+  for (const [, group] of urlGroups) {
+    if (group.length > 1) {
+      patterns.push(group);
+    }
+  }

+  return patterns;
+}
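
For context, here is a minimal usage sketch of the exported helpers in the diff above. It is not taken from the package documentation; the relative import path, the example.com URLs, and the HTML snippets are illustrative assumptions, and the expected outputs are derived from the code as published.

// Minimal usage sketch; the import path, URLs, and HTML below are assumptions for illustration.
import {
  calculateKeywordSimilarity,
  analyzeKeywordCannibalization,
  detectUrlCannibalizationPatterns,
} from './keyword-cannibalization.js'; // assumed path, relative to src/audit/checks/

// Jaccard similarity over keyword sets: |intersection| / |union| = 2 / 4 = 0.5 here.
const similarity = calculateKeywordSimilarity(
  ['chicago', 'lawyer', 'injury'],
  ['chicago', 'attorney', 'injury']
);
console.log(similarity); // 0.5

// Multi-page analysis over a url -> html map. Pages whose primary keywords
// (words shared by <title> and <h1>, or high-density title words) overlap by
// more than 50% are reported as KEYWORD_CANNIBALIZATION warnings.
const pages = new Map<string, string>([
  [
    'https://example.com/chicago-lawyer',
    '<title>Chicago Lawyer</title><h1>Chicago Lawyer</h1><p>Find a lawyer in Chicago.</p>',
  ],
  [
    'https://example.com/chicago-attorney',
    '<title>Chicago Lawyer Services</title><h1>Chicago Lawyer</h1><p>Hire a lawyer in Chicago.</p>',
  ],
]);
const { issues, data } = analyzeKeywordCannibalization(pages);
console.log(data.pagesAnalyzed); // 2
console.log(issues.map((i) => i.code));
// ['KEYWORD_CANNIBALIZATION', 'KEYWORD_CANNIBALIZATION'] -- one issue per shared primary keyword

// URL-level check: "attorney" normalizes to "lawyer", so both slugs group together.
console.log(detectUrlCannibalizationPatterns([...pages.keys()]));
// [['https://example.com/chicago-lawyer', 'https://example.com/chicago-attorney']]

Note that the 0.5 similarity threshold is hard-coded in detectCannibalization, and extractPageKeywords only counts words longer than three characters, so very short keywords are ignored by design.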