@rankcli/agent-runtime 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/README.md +242 -0
  2. package/dist/analyzer-2CSWIQGD.mjs +6 -0
  3. package/dist/chunk-YNZYHEYM.mjs +774 -0
  4. package/dist/index.d.mts +4012 -0
  5. package/dist/index.d.ts +4012 -0
  6. package/dist/index.js +29672 -0
  7. package/dist/index.mjs +28602 -0
  8. package/package.json +53 -0
  9. package/scripts/build-deno.ts +134 -0
  10. package/src/audit/ai/analyzer.ts +347 -0
  11. package/src/audit/ai/index.ts +29 -0
  12. package/src/audit/ai/prompts/content-analysis.ts +271 -0
  13. package/src/audit/ai/types.ts +179 -0
  14. package/src/audit/checks/additional-checks.ts +439 -0
  15. package/src/audit/checks/ai-citation-worthiness.ts +399 -0
  16. package/src/audit/checks/ai-content-structure.ts +325 -0
  17. package/src/audit/checks/ai-readiness.ts +339 -0
  18. package/src/audit/checks/anchor-text.ts +179 -0
  19. package/src/audit/checks/answer-conciseness.ts +322 -0
  20. package/src/audit/checks/asset-minification.ts +270 -0
  21. package/src/audit/checks/bing-optimization.ts +206 -0
  22. package/src/audit/checks/brand-mention-optimization.ts +349 -0
  23. package/src/audit/checks/caching-headers.ts +305 -0
  24. package/src/audit/checks/canonical-advanced.ts +150 -0
  25. package/src/audit/checks/canonical-domain.ts +196 -0
  26. package/src/audit/checks/citation-quality.ts +358 -0
  27. package/src/audit/checks/client-rendering.ts +542 -0
  28. package/src/audit/checks/color-contrast.ts +342 -0
  29. package/src/audit/checks/content-freshness.ts +170 -0
  30. package/src/audit/checks/content-science.ts +589 -0
  31. package/src/audit/checks/conversion-elements.ts +526 -0
  32. package/src/audit/checks/crawlability.ts +220 -0
  33. package/src/audit/checks/directory-listing.ts +172 -0
  34. package/src/audit/checks/dom-analysis.ts +191 -0
  35. package/src/audit/checks/dom-size.ts +246 -0
  36. package/src/audit/checks/duplicate-content.ts +194 -0
  37. package/src/audit/checks/eeat-signals.ts +990 -0
  38. package/src/audit/checks/entity-seo.ts +396 -0
  39. package/src/audit/checks/featured-snippet.ts +473 -0
  40. package/src/audit/checks/freshness-signals.ts +443 -0
  41. package/src/audit/checks/funnel-intent.ts +463 -0
  42. package/src/audit/checks/hreflang.ts +174 -0
  43. package/src/audit/checks/html-compliance.ts +302 -0
  44. package/src/audit/checks/image-dimensions.ts +167 -0
  45. package/src/audit/checks/images.ts +160 -0
  46. package/src/audit/checks/indexnow.ts +275 -0
  47. package/src/audit/checks/interactive-tools.ts +475 -0
  48. package/src/audit/checks/internal-link-graph.ts +436 -0
  49. package/src/audit/checks/keyword-analysis.ts +239 -0
  50. package/src/audit/checks/keyword-cannibalization.ts +385 -0
  51. package/src/audit/checks/keyword-placement.ts +471 -0
  52. package/src/audit/checks/links.ts +203 -0
  53. package/src/audit/checks/llms-txt.ts +224 -0
  54. package/src/audit/checks/local-seo.ts +296 -0
  55. package/src/audit/checks/mobile.ts +167 -0
  56. package/src/audit/checks/modern-images.ts +226 -0
  57. package/src/audit/checks/navboost-signals.ts +395 -0
  58. package/src/audit/checks/on-page.ts +209 -0
  59. package/src/audit/checks/page-resources.ts +285 -0
  60. package/src/audit/checks/pagination.ts +180 -0
  61. package/src/audit/checks/performance.ts +153 -0
  62. package/src/audit/checks/platform-presence.ts +580 -0
  63. package/src/audit/checks/redirect-analysis.ts +153 -0
  64. package/src/audit/checks/redirect-chain.ts +389 -0
  65. package/src/audit/checks/resource-hints.ts +420 -0
  66. package/src/audit/checks/responsive-css.ts +247 -0
  67. package/src/audit/checks/responsive-images.ts +396 -0
  68. package/src/audit/checks/review-ecosystem.ts +415 -0
  69. package/src/audit/checks/robots-validation.ts +373 -0
  70. package/src/audit/checks/security-headers.ts +172 -0
  71. package/src/audit/checks/security.ts +144 -0
  72. package/src/audit/checks/serp-preview.ts +251 -0
  73. package/src/audit/checks/site-maturity.ts +444 -0
  74. package/src/audit/checks/social-meta.test.ts +275 -0
  75. package/src/audit/checks/social-meta.ts +134 -0
  76. package/src/audit/checks/soft-404.ts +151 -0
  77. package/src/audit/checks/structured-data.ts +238 -0
  78. package/src/audit/checks/tech-detection.ts +496 -0
  79. package/src/audit/checks/topical-clusters.ts +435 -0
  80. package/src/audit/checks/tracker-bloat.ts +462 -0
  81. package/src/audit/checks/tracking-verification.test.ts +371 -0
  82. package/src/audit/checks/tracking-verification.ts +636 -0
  83. package/src/audit/checks/url-safety.ts +682 -0
  84. package/src/audit/deno-entry.ts +66 -0
  85. package/src/audit/discovery/index.ts +15 -0
  86. package/src/audit/discovery/link-crawler.ts +232 -0
  87. package/src/audit/discovery/repo-routes.ts +347 -0
  88. package/src/audit/engine.ts +620 -0
  89. package/src/audit/fixes/index.ts +209 -0
  90. package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
  91. package/src/audit/fixes/social-meta-fixes.ts +463 -0
  92. package/src/audit/index.ts +74 -0
  93. package/src/audit/runner.test.ts +299 -0
  94. package/src/audit/runner.ts +130 -0
  95. package/src/audit/types.ts +1953 -0
  96. package/src/content/featured-snippet.ts +367 -0
  97. package/src/content/generator.test.ts +534 -0
  98. package/src/content/generator.ts +501 -0
  99. package/src/content/headline.ts +317 -0
  100. package/src/content/index.ts +62 -0
  101. package/src/content/intent.ts +258 -0
  102. package/src/content/keyword-density.ts +349 -0
  103. package/src/content/readability.ts +262 -0
  104. package/src/executor.ts +336 -0
  105. package/src/fixer.ts +416 -0
  106. package/src/frameworks/detector.test.ts +248 -0
  107. package/src/frameworks/detector.ts +371 -0
  108. package/src/frameworks/index.ts +68 -0
  109. package/src/frameworks/recipes/angular.yaml +171 -0
  110. package/src/frameworks/recipes/astro.yaml +206 -0
  111. package/src/frameworks/recipes/django.yaml +180 -0
  112. package/src/frameworks/recipes/laravel.yaml +137 -0
  113. package/src/frameworks/recipes/nextjs.yaml +268 -0
  114. package/src/frameworks/recipes/nuxt.yaml +175 -0
  115. package/src/frameworks/recipes/rails.yaml +188 -0
  116. package/src/frameworks/recipes/react.yaml +202 -0
  117. package/src/frameworks/recipes/sveltekit.yaml +154 -0
  118. package/src/frameworks/recipes/vue.yaml +137 -0
  119. package/src/frameworks/recipes/wordpress.yaml +209 -0
  120. package/src/frameworks/suggestion-engine.ts +320 -0
  121. package/src/geo/geo-content.test.ts +305 -0
  122. package/src/geo/geo-content.ts +266 -0
  123. package/src/geo/geo-history.test.ts +473 -0
  124. package/src/geo/geo-history.ts +433 -0
  125. package/src/geo/geo-tracker.test.ts +359 -0
  126. package/src/geo/geo-tracker.ts +411 -0
  127. package/src/geo/index.ts +10 -0
  128. package/src/git/commit-helper.test.ts +261 -0
  129. package/src/git/commit-helper.ts +329 -0
  130. package/src/git/index.ts +12 -0
  131. package/src/git/pr-helper.test.ts +284 -0
  132. package/src/git/pr-helper.ts +307 -0
  133. package/src/index.ts +66 -0
  134. package/src/keywords/ai-keyword-engine.ts +1062 -0
  135. package/src/keywords/ai-summarizer.ts +387 -0
  136. package/src/keywords/ci-mode.ts +555 -0
  137. package/src/keywords/engine.ts +359 -0
  138. package/src/keywords/index.ts +151 -0
  139. package/src/keywords/llm-judge.ts +357 -0
  140. package/src/keywords/nlp-analysis.ts +706 -0
  141. package/src/keywords/prioritizer.ts +295 -0
  142. package/src/keywords/site-crawler.ts +342 -0
  143. package/src/keywords/sources/autocomplete.ts +139 -0
  144. package/src/keywords/sources/competitive-search.ts +450 -0
  145. package/src/keywords/sources/competitor-analysis.ts +374 -0
  146. package/src/keywords/sources/dataforseo.ts +206 -0
  147. package/src/keywords/sources/free-sources.ts +294 -0
  148. package/src/keywords/sources/gsc.ts +123 -0
  149. package/src/keywords/topic-grouping.ts +327 -0
  150. package/src/keywords/types.ts +144 -0
  151. package/src/keywords/wizard.ts +457 -0
  152. package/src/loader.ts +40 -0
  153. package/src/reports/index.ts +7 -0
  154. package/src/reports/report-generator.test.ts +293 -0
  155. package/src/reports/report-generator.ts +713 -0
  156. package/src/scheduler/alerts.test.ts +458 -0
  157. package/src/scheduler/alerts.ts +328 -0
  158. package/src/scheduler/index.ts +8 -0
  159. package/src/scheduler/scheduled-audit.test.ts +377 -0
  160. package/src/scheduler/scheduled-audit.ts +149 -0
  161. package/src/test/integration-test.ts +325 -0
  162. package/src/tools/analyzer.ts +373 -0
  163. package/src/tools/crawl.ts +293 -0
  164. package/src/tools/files.ts +301 -0
  165. package/src/tools/h1-fixer.ts +249 -0
  166. package/src/tools/index.ts +67 -0
  167. package/src/tracking/github-action.ts +326 -0
  168. package/src/tracking/google-analytics.ts +265 -0
  169. package/src/tracking/index.ts +45 -0
  170. package/src/tracking/report-generator.ts +386 -0
  171. package/src/tracking/search-console.ts +335 -0
  172. package/src/types.ts +134 -0
  173. package/src/utils/http.ts +302 -0
  174. package/src/wasm-adapter.ts +297 -0
  175. package/src/wasm-entry.ts +14 -0
  176. package/tsconfig.json +17 -0
  177. package/tsup.wasm.config.ts +26 -0
  178. package/vitest.config.ts +15 -0
@@ -0,0 +1,385 @@
1
+ // Keyword Cannibalization Detection
2
+ // Based on Nathan Gotch's SEO technique: Detecting when multiple pages target the same keyword
3
+ // This causes pages to compete against each other in search results
4
+
5
+ import * as cheerio from 'cheerio';
6
+ import type { AuditIssue } from '../types.js';
7
+
8
/**
 * Keyword signals collected for one page, as populated by `extractPageKeywords`.
 */
export interface PageKeywordData {
  /** URL the page was analyzed under; passed through unchanged. */
  url: string;
  /** Trimmed text of the `<title>` element ('' when there is none). */
  title: string;
  /** Trimmed text of each `<h1>` element, one entry per element. */
  h1: string[];
  /** Trimmed `content` of `<meta name="description">` ('' when missing). */
  metaDescription: string;
  /** Title words that also occur in an H1, followed by title words with body density above 1%. */
  primaryKeywords: string[];
  /** Per-word share of the counted body words (occurrences / total counted words). */
  keywordDensity: Map<string, number>;
}
16
+
17
/**
 * One keyword that several pages list as a primary keyword.
 */
export interface CannibalizationIssue {
  /** The primary keyword shared by the pages. */
  keyword: string;
  /** URLs of the pages that list `keyword` among their primary keywords. */
  pages: string[];
  /** Highest pairwise keyword similarity found among `pages` (0 to 1). */
  similarity: number;
}
22
+
23
/**
 * Result payload of a multi-page cannibalization analysis.
 */
export interface KeywordCannibalizationData {
  /** Number of pages that were handed to the analysis. */
  pagesAnalyzed: number;
  /** Keywords found to be targeted by more than one sufficiently similar page. */
  potentialIssues: CannibalizationIssue[];
  /** Keyword data extracted for every analyzed page. */
  pageKeywords: PageKeywordData[];
}
28
+
29
/**
 * Split text into lowercase keyword candidates: every character outside
 * a-z / 0-9 / whitespace becomes a separator and words of three characters
 * or fewer are discarded.
 */
function tokenizeSignificantWords(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^a-z0-9\s]/g, ' ')
    .split(/\s+/)
    .filter((word) => word.length > 3);
}

/**
 * Extract keyword signals from a page's HTML.
 *
 * Reads the title, the H1 texts and the meta description, then computes a
 * per-word density over the body text after `script`, `style`, `nav`,
 * `footer` and `header` elements have been removed.
 *
 * Primary keywords are, in this order:
 *   1. title words that also occur in an H1;
 *   2. remaining title words whose body density exceeds 1%.
 *
 * Tokenization is ASCII-only: characters outside a-z/0-9 are treated as
 * separators, so words containing accented or non-Latin letters are split
 * or dropped.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL recorded on the result; it is not otherwise inspected.
 * @returns The extracted title, H1s, meta description, primary keywords and density map.
 */
export function extractPageKeywords(html: string, url: string): PageKeywordData {
  const $ = cheerio.load(html);

  const title = $('title').text().trim();

  const h1s: string[] = [];
  $('h1').each((_, el) => {
    h1s.push($(el).text().trim());
  });

  const metaDescription = $('meta[name="description"]').attr('content')?.trim() || '';

  // The H1s must be read before this removal: an <h1> nested in <header>
  // would otherwise disappear together with its container.
  $('script, style, nav, footer, header').remove();

  // NOTE(review): presumably .text() joins neighbouring elements' text without
  // a separator, which would merge words in minified HTML — confirm.
  const words = tokenizeSignificantWords($('body').text());

  // Count word frequency
  const wordCount = new Map<string, number>();
  for (const word of words) {
    wordCount.set(word, (wordCount.get(word) ?? 0) + 1);
  }

  // Density = occurrences / total counted words. The loop body only runs when
  // at least one word exists, so the divisor is never zero.
  const totalWords = words.length;
  const keywordDensity = new Map<string, number>();
  for (const [word, count] of wordCount) {
    keywordDensity.set(word, count / totalWords);
  }

  const titleWords = tokenizeSignificantWords(title);
  // Set lookup instead of scanning the H1 word list once per title word
  const h1WordSet = new Set(tokenizeSignificantWords(h1s.join(' ')));

  const primaryKeywords: string[] = [];
  const seen = new Set<string>();
  const addPrimaryKeyword = (word: string): void => {
    if (!seen.has(word)) {
      seen.add(word);
      primaryKeywords.push(word);
    }
  };

  // Pass 1: words present in both title and H1 (strongest signal) come first
  for (const word of titleWords) {
    if (h1WordSet.has(word)) {
      addPrimaryKeyword(word);
    }
  }

  // Pass 2: remaining title words that make up more than 1% of the body words
  for (const word of titleWords) {
    if ((keywordDensity.get(word) ?? 0) > 0.01) {
      addPrimaryKeyword(word);
    }
  }

  return {
    url,
    title,
    h1: h1s,
    metaDescription,
    primaryKeywords,
    keywordDensity,
  };
}
114
+
115
/**
 * Build 2-word and 3-word phrases (n-grams) from a page's title, H1s and
 * meta description.
 *
 * The three texts are lowercased and concatenated into a single word
 * sequence before phrases are formed, so a phrase may span the boundary
 * between e.g. the title and the first H1. A phrase is kept only when every
 * word in it is longer than two characters. All 2-word phrases are returned
 * first, followed by all 3-word phrases; duplicates are not removed.
 *
 * @param html - Raw HTML of the page.
 * @returns The phrases, each with its words joined by a single space.
 */
export function extractKeyPhrases(html: string): string[] {
  const $ = cheerio.load(html);

  const titleText = $('title').text().trim().toLowerCase();
  const headingText = $('h1')
    .map((_, el) => $(el).text().trim().toLowerCase())
    .get()
    .join(' ');
  const descriptionText = ($('meta[name="description"]').attr('content') || '').toLowerCase();

  const tokens = `${titleText} ${headingText} ${descriptionText}`
    .replace(/[^a-z0-9\s]/g, ' ')
    .split(/\s+/)
    .filter(Boolean);

  const collected: string[] = [];

  // Slide a window of each size over the token list: bigrams first, then trigrams
  for (const size of [2, 3]) {
    for (let start = 0; start + size <= tokens.length; start++) {
      const gram = tokens.slice(start, start + size);
      if (gram.every((token) => token.length > 2)) {
        collected.push(gram.join(' '));
      }
    }
  }

  return collected;
}
150
+
151
/**
 * Jaccard similarity of two keyword lists: the number of distinct keywords
 * they share divided by the number of distinct keywords across both.
 *
 * @param keywords1 - First keyword list; repeated entries count once.
 * @param keywords2 - Second keyword list; repeated entries count once.
 * @returns A value from 0 to 1; 0 when either list is empty.
 */
export function calculateKeywordSimilarity(keywords1: string[], keywords2: string[]): number {
  if (!keywords1.length || !keywords2.length) {
    return 0;
  }

  const first = new Set(keywords1);
  const second = new Set(keywords2);

  let shared = 0;
  for (const keyword of first) {
    if (second.has(keyword)) {
      shared += 1;
    }
  }

  // |A ∪ B| = |A| + |B| − |A ∩ B|
  const combined = first.size + second.size - shared;
  return shared / combined;
}
165
+
166
/**
 * Detect potential keyword cannibalization between pages.
 *
 * For every primary keyword listed by two or more distinct page URLs, the
 * keyword similarity of each pair of those pages is computed. The keyword is
 * reported when the highest pairwise similarity exceeds 0.5.
 *
 * A URL is counted once per keyword. Without that, a page whose
 * `primaryKeywords` repeats a keyword (or a page passed in twice) would be
 * compared with itself, score 1.0 and be reported as competing with itself.
 *
 * @param pages - Keyword data of the crawled pages. When several entries
 *   share a URL, the first one is used for similarity.
 * @returns One issue per reported keyword, in first-seen keyword order.
 */
export function detectCannibalization(pages: PageKeywordData[]): CannibalizationIssue[] {
  // URL -> page index, built once so the pair loop below does not rescan
  // `pages`. The first entry wins for duplicate URLs.
  const pageByUrl = new Map<string, PageKeywordData>();
  for (const page of pages) {
    if (!pageByUrl.has(page.url)) {
      pageByUrl.set(page.url, page);
    }
  }

  // keyword -> distinct URLs targeting it (a Set keeps first-seen order)
  const keywordToUrls = new Map<string, Set<string>>();
  for (const page of pages) {
    for (const keyword of page.primaryKeywords) {
      let urlsForKeyword = keywordToUrls.get(keyword);
      if (!urlsForKeyword) {
        urlsForKeyword = new Set<string>();
        keywordToUrls.set(keyword, urlsForKeyword);
      }
      urlsForKeyword.add(page.url);
    }
  }

  const issues: CannibalizationIssue[] = [];

  for (const [keyword, urlSet] of keywordToUrls) {
    if (urlSet.size < 2) {
      continue;
    }

    const urls = [...urlSet];
    let maxSimilarity = 0;

    // Compare every unordered pair of pages sharing this keyword
    for (let i = 0; i < urls.length; i++) {
      for (let j = i + 1; j < urls.length; j++) {
        const page1 = pageByUrl.get(urls[i]);
        const page2 = pageByUrl.get(urls[j]);

        if (page1 && page2) {
          const similarity = calculateKeywordSimilarity(page1.primaryKeywords, page2.primaryKeywords);
          maxSimilarity = Math.max(maxSimilarity, similarity);
        }
      }
    }

    // Only flag if similarity is above threshold (0.5 = 50% overlap)
    if (maxSimilarity > 0.5) {
      issues.push({
        keyword,
        pages: urls,
        similarity: maxSimilarity,
      });
    }
  }

  return issues;
}
214
+
215
/**
 * Audit a single page for keyword-focus problems.
 *
 * Two checks are performed on the data returned by `extractPageKeywords`:
 *   - duplicate H1s: the page has more than one H1 and at least two of them
 *     are equal after lowercasing and trimming;
 *   - title/H1 mismatch: title and H1 text each contain at least one keyword
 *     (lowercase a-z/0-9 word longer than three characters) but share none.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL of the page; reported as the affected URL on each issue.
 * @returns The issues found plus the extracted keyword data for the page.
 */
export function analyzeKeywordCannibalizationSinglePage(
  html: string,
  url: string
): { issues: AuditIssue[]; data: PageKeywordData } {
  const data = extractPageKeywords(html, url);
  const issues: AuditIssue[] = [];

  // Distinct lowercase words longer than three characters
  const toKeywordSet = (text: string): Set<string> =>
    new Set(
      text
        .toLowerCase()
        .replace(/[^a-z0-9\s]/g, ' ')
        .split(/\s+/)
        .filter((token) => token.length > 3)
    );

  // Duplicate H1 check: fewer distinct texts than headings means a repeat
  const normalizedH1s = data.h1.map((heading) => heading.toLowerCase().trim());
  const hasRepeatedH1 =
    data.h1.length > 1 && new Set(normalizedH1s).size < normalizedH1s.length;

  if (hasRepeatedH1) {
    issues.push({
      code: 'INTERNAL_CANNIBALIZATION_SIGNAL',
      severity: 'warning',
      category: 'content',
      title: 'Duplicate H1 tags on page',
      description: `Page has ${normalizedH1s.length} H1 tags with duplicates, which can dilute keyword focus.`,
      impact: 'Multiple similar H1s can confuse search engines about page topic.',
      howToFix: 'Use a single, unique H1 that clearly defines the page topic.',
      affectedUrls: [url],
      details: { h1s: data.h1 },
    });
  }

  // Title/H1 mismatch check: both sides have keywords, yet none in common
  const titleKeywords = toKeywordSet(data.title);
  const h1Keywords = toKeywordSet(data.h1.join(' '));
  const hasSharedKeyword = [...titleKeywords].some((word) => h1Keywords.has(word));

  if (titleKeywords.size > 0 && h1Keywords.size > 0 && !hasSharedKeyword) {
    issues.push({
      code: 'TITLE_H1_KEYWORD_MISMATCH',
      severity: 'warning',
      category: 'on-page',
      title: 'Title and H1 target different keywords',
      description: 'The title tag and H1 heading have no overlapping keywords.',
      impact: 'Search engines may be confused about the primary topic of the page.',
      howToFix: 'Align title and H1 to target the same primary keyword or topic.',
      affectedUrls: [url],
      details: {
        titleKeywords: [...titleKeywords],
        h1Keywords: [...h1Keywords],
      },
    });
  }

  return { issues, data };
}
286
+
287
/**
 * Full cannibalization analysis for multi-page crawls.
 *
 * Extracts keyword data for every page, detects keywords that several
 * similar pages target, and turns each detection into a
 * `KEYWORD_CANNIBALIZATION` audit issue.
 *
 * Single-page issues (duplicate H1s, title/H1 mismatch) are not part of the
 * result, so keyword data is extracted directly instead of running the
 * single-page audit and discarding its issues.
 *
 * @param pagesHtml - Map from page URL to that page's raw HTML.
 * @returns The audit issues plus the underlying analysis data.
 */
export function analyzeKeywordCannibalization(
  pagesHtml: Map<string, string>
): { issues: AuditIssue[]; data: KeywordCannibalizationData } {
  // Extract keywords from all pages, in map iteration order
  const pageKeywords: PageKeywordData[] = [];
  for (const [url, html] of pagesHtml) {
    pageKeywords.push(extractPageKeywords(html, url));
  }

  // Detect cannibalization issues
  const cannibalizationIssues = detectCannibalization(pageKeywords);

  // Create one audit issue per cannibalized keyword
  const issues = cannibalizationIssues.map(
    (issue): AuditIssue => ({
      code: 'KEYWORD_CANNIBALIZATION',
      severity: 'warning',
      category: 'content',
      title: `Keyword cannibalization: "${issue.keyword}"`,
      description: `Multiple pages (${issue.pages.length}) are targeting the same keyword "${issue.keyword}" with ${Math.round(issue.similarity * 100)}% keyword overlap.`,
      impact: 'These pages compete against each other in search results, potentially hurting all their rankings.',
      howToFix:
        'Consolidate pages into one comprehensive page, differentiate keyword targeting, or use canonical tags to indicate the preferred version.',
      affectedUrls: issue.pages,
      details: {
        keyword: issue.keyword,
        similarity: issue.similarity,
        pageCount: issue.pages.length,
      },
    })
  );

  return {
    issues,
    data: {
      pagesAnalyzed: pagesHtml.size,
      potentialIssues: cannibalizationIssues,
      pageKeywords,
    },
  };
}
334
+
335
/**
 * Check for URL-level cannibalization signals
 * (e.g., /chicago-lawyer and /chicago-attorney targeting same intent).
 *
 * Each URL's path is lowercased and every known synonym in it is replaced by
 * the first term of its synonym group; URLs whose normalized paths are equal
 * are grouped together. Only the path is compared: host, query string and
 * fragment are ignored.
 *
 * A URL string that appears more than once in the input is counted once, so
 * a page is never reported as competing with itself.
 *
 * @param urls - Absolute URLs to compare. Strings that cannot be parsed as
 *   URLs are skipped.
 * @returns Groups of two or more distinct URLs sharing a normalized path, in
 *   first-seen order.
 */
export function detectUrlCannibalizationPatterns(urls: string[]): string[][] {
  // Each group lists interchangeable terms; the first entry is the canonical form
  const synonymGroups: ReadonlyArray<readonly [string, ...string[]]> = [
    ['lawyer', 'attorney'],
    ['best', 'top'],
    ['guide', 'tutorial', 'how-to'],
    ['review', 'reviews'],
    ['buy', 'purchase', 'shop'],
    ['cheap', 'affordable', 'budget'],
    ['vs', 'versus', 'compared'],
  ];

  // Compile the whole-word patterns once instead of once per URL.
  // String.prototype.replace restarts a global regex from index 0, so the
  // same RegExp objects can be reused safely across URLs.
  const replacements: Array<{ pattern: RegExp; canonical: string }> = [];
  for (const [canonical, ...variants] of synonymGroups) {
    for (const variant of variants) {
      replacements.push({ pattern: new RegExp(`\\b${variant}\\b`, 'g'), canonical });
    }
  }

  // Normalized path -> distinct URLs (a Set keeps first-seen order)
  const urlGroups = new Map<string, Set<string>>();

  for (const url of urls) {
    let normalizedPath: string;
    try {
      normalizedPath = new URL(url).pathname.toLowerCase();
    } catch {
      // Invalid URL, skip
      continue;
    }

    // Replace synonyms with canonical version
    for (const { pattern, canonical } of replacements) {
      normalizedPath = normalizedPath.replace(pattern, canonical);
    }

    let group = urlGroups.get(normalizedPath);
    if (!group) {
      group = new Set<string>();
      urlGroups.set(normalizedPath, group);
    }
    group.add(url);
  }

  // Keep groups with multiple distinct URLs (potential cannibalization)
  const patterns: string[][] = [];
  for (const group of urlGroups.values()) {
    if (group.size > 1) {
      patterns.push([...group]);
    }
  }

  return patterns;
}