@rankcli/agent-runtime 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/README.md +242 -0
  2. package/dist/analyzer-2CSWIQGD.mjs +6 -0
  3. package/dist/chunk-YNZYHEYM.mjs +774 -0
  4. package/dist/index.d.mts +4012 -0
  5. package/dist/index.d.ts +4012 -0
  6. package/dist/index.js +29672 -0
  7. package/dist/index.mjs +28602 -0
  8. package/package.json +53 -0
  9. package/scripts/build-deno.ts +134 -0
  10. package/src/audit/ai/analyzer.ts +347 -0
  11. package/src/audit/ai/index.ts +29 -0
  12. package/src/audit/ai/prompts/content-analysis.ts +271 -0
  13. package/src/audit/ai/types.ts +179 -0
  14. package/src/audit/checks/additional-checks.ts +439 -0
  15. package/src/audit/checks/ai-citation-worthiness.ts +399 -0
  16. package/src/audit/checks/ai-content-structure.ts +325 -0
  17. package/src/audit/checks/ai-readiness.ts +339 -0
  18. package/src/audit/checks/anchor-text.ts +179 -0
  19. package/src/audit/checks/answer-conciseness.ts +322 -0
  20. package/src/audit/checks/asset-minification.ts +270 -0
  21. package/src/audit/checks/bing-optimization.ts +206 -0
  22. package/src/audit/checks/brand-mention-optimization.ts +349 -0
  23. package/src/audit/checks/caching-headers.ts +305 -0
  24. package/src/audit/checks/canonical-advanced.ts +150 -0
  25. package/src/audit/checks/canonical-domain.ts +196 -0
  26. package/src/audit/checks/citation-quality.ts +358 -0
  27. package/src/audit/checks/client-rendering.ts +542 -0
  28. package/src/audit/checks/color-contrast.ts +342 -0
  29. package/src/audit/checks/content-freshness.ts +170 -0
  30. package/src/audit/checks/content-science.ts +589 -0
  31. package/src/audit/checks/conversion-elements.ts +526 -0
  32. package/src/audit/checks/crawlability.ts +220 -0
  33. package/src/audit/checks/directory-listing.ts +172 -0
  34. package/src/audit/checks/dom-analysis.ts +191 -0
  35. package/src/audit/checks/dom-size.ts +246 -0
  36. package/src/audit/checks/duplicate-content.ts +194 -0
  37. package/src/audit/checks/eeat-signals.ts +990 -0
  38. package/src/audit/checks/entity-seo.ts +396 -0
  39. package/src/audit/checks/featured-snippet.ts +473 -0
  40. package/src/audit/checks/freshness-signals.ts +443 -0
  41. package/src/audit/checks/funnel-intent.ts +463 -0
  42. package/src/audit/checks/hreflang.ts +174 -0
  43. package/src/audit/checks/html-compliance.ts +302 -0
  44. package/src/audit/checks/image-dimensions.ts +167 -0
  45. package/src/audit/checks/images.ts +160 -0
  46. package/src/audit/checks/indexnow.ts +275 -0
  47. package/src/audit/checks/interactive-tools.ts +475 -0
  48. package/src/audit/checks/internal-link-graph.ts +436 -0
  49. package/src/audit/checks/keyword-analysis.ts +239 -0
  50. package/src/audit/checks/keyword-cannibalization.ts +385 -0
  51. package/src/audit/checks/keyword-placement.ts +471 -0
  52. package/src/audit/checks/links.ts +203 -0
  53. package/src/audit/checks/llms-txt.ts +224 -0
  54. package/src/audit/checks/local-seo.ts +296 -0
  55. package/src/audit/checks/mobile.ts +167 -0
  56. package/src/audit/checks/modern-images.ts +226 -0
  57. package/src/audit/checks/navboost-signals.ts +395 -0
  58. package/src/audit/checks/on-page.ts +209 -0
  59. package/src/audit/checks/page-resources.ts +285 -0
  60. package/src/audit/checks/pagination.ts +180 -0
  61. package/src/audit/checks/performance.ts +153 -0
  62. package/src/audit/checks/platform-presence.ts +580 -0
  63. package/src/audit/checks/redirect-analysis.ts +153 -0
  64. package/src/audit/checks/redirect-chain.ts +389 -0
  65. package/src/audit/checks/resource-hints.ts +420 -0
  66. package/src/audit/checks/responsive-css.ts +247 -0
  67. package/src/audit/checks/responsive-images.ts +396 -0
  68. package/src/audit/checks/review-ecosystem.ts +415 -0
  69. package/src/audit/checks/robots-validation.ts +373 -0
  70. package/src/audit/checks/security-headers.ts +172 -0
  71. package/src/audit/checks/security.ts +144 -0
  72. package/src/audit/checks/serp-preview.ts +251 -0
  73. package/src/audit/checks/site-maturity.ts +444 -0
  74. package/src/audit/checks/social-meta.test.ts +275 -0
  75. package/src/audit/checks/social-meta.ts +134 -0
  76. package/src/audit/checks/soft-404.ts +151 -0
  77. package/src/audit/checks/structured-data.ts +238 -0
  78. package/src/audit/checks/tech-detection.ts +496 -0
  79. package/src/audit/checks/topical-clusters.ts +435 -0
  80. package/src/audit/checks/tracker-bloat.ts +462 -0
  81. package/src/audit/checks/tracking-verification.test.ts +371 -0
  82. package/src/audit/checks/tracking-verification.ts +636 -0
  83. package/src/audit/checks/url-safety.ts +682 -0
  84. package/src/audit/deno-entry.ts +66 -0
  85. package/src/audit/discovery/index.ts +15 -0
  86. package/src/audit/discovery/link-crawler.ts +232 -0
  87. package/src/audit/discovery/repo-routes.ts +347 -0
  88. package/src/audit/engine.ts +620 -0
  89. package/src/audit/fixes/index.ts +209 -0
  90. package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
  91. package/src/audit/fixes/social-meta-fixes.ts +463 -0
  92. package/src/audit/index.ts +74 -0
  93. package/src/audit/runner.test.ts +299 -0
  94. package/src/audit/runner.ts +130 -0
  95. package/src/audit/types.ts +1953 -0
  96. package/src/content/featured-snippet.ts +367 -0
  97. package/src/content/generator.test.ts +534 -0
  98. package/src/content/generator.ts +501 -0
  99. package/src/content/headline.ts +317 -0
  100. package/src/content/index.ts +62 -0
  101. package/src/content/intent.ts +258 -0
  102. package/src/content/keyword-density.ts +349 -0
  103. package/src/content/readability.ts +262 -0
  104. package/src/executor.ts +336 -0
  105. package/src/fixer.ts +416 -0
  106. package/src/frameworks/detector.test.ts +248 -0
  107. package/src/frameworks/detector.ts +371 -0
  108. package/src/frameworks/index.ts +68 -0
  109. package/src/frameworks/recipes/angular.yaml +171 -0
  110. package/src/frameworks/recipes/astro.yaml +206 -0
  111. package/src/frameworks/recipes/django.yaml +180 -0
  112. package/src/frameworks/recipes/laravel.yaml +137 -0
  113. package/src/frameworks/recipes/nextjs.yaml +268 -0
  114. package/src/frameworks/recipes/nuxt.yaml +175 -0
  115. package/src/frameworks/recipes/rails.yaml +188 -0
  116. package/src/frameworks/recipes/react.yaml +202 -0
  117. package/src/frameworks/recipes/sveltekit.yaml +154 -0
  118. package/src/frameworks/recipes/vue.yaml +137 -0
  119. package/src/frameworks/recipes/wordpress.yaml +209 -0
  120. package/src/frameworks/suggestion-engine.ts +320 -0
  121. package/src/geo/geo-content.test.ts +305 -0
  122. package/src/geo/geo-content.ts +266 -0
  123. package/src/geo/geo-history.test.ts +473 -0
  124. package/src/geo/geo-history.ts +433 -0
  125. package/src/geo/geo-tracker.test.ts +359 -0
  126. package/src/geo/geo-tracker.ts +411 -0
  127. package/src/geo/index.ts +10 -0
  128. package/src/git/commit-helper.test.ts +261 -0
  129. package/src/git/commit-helper.ts +329 -0
  130. package/src/git/index.ts +12 -0
  131. package/src/git/pr-helper.test.ts +284 -0
  132. package/src/git/pr-helper.ts +307 -0
  133. package/src/index.ts +66 -0
  134. package/src/keywords/ai-keyword-engine.ts +1062 -0
  135. package/src/keywords/ai-summarizer.ts +387 -0
  136. package/src/keywords/ci-mode.ts +555 -0
  137. package/src/keywords/engine.ts +359 -0
  138. package/src/keywords/index.ts +151 -0
  139. package/src/keywords/llm-judge.ts +357 -0
  140. package/src/keywords/nlp-analysis.ts +706 -0
  141. package/src/keywords/prioritizer.ts +295 -0
  142. package/src/keywords/site-crawler.ts +342 -0
  143. package/src/keywords/sources/autocomplete.ts +139 -0
  144. package/src/keywords/sources/competitive-search.ts +450 -0
  145. package/src/keywords/sources/competitor-analysis.ts +374 -0
  146. package/src/keywords/sources/dataforseo.ts +206 -0
  147. package/src/keywords/sources/free-sources.ts +294 -0
  148. package/src/keywords/sources/gsc.ts +123 -0
  149. package/src/keywords/topic-grouping.ts +327 -0
  150. package/src/keywords/types.ts +144 -0
  151. package/src/keywords/wizard.ts +457 -0
  152. package/src/loader.ts +40 -0
  153. package/src/reports/index.ts +7 -0
  154. package/src/reports/report-generator.test.ts +293 -0
  155. package/src/reports/report-generator.ts +713 -0
  156. package/src/scheduler/alerts.test.ts +458 -0
  157. package/src/scheduler/alerts.ts +328 -0
  158. package/src/scheduler/index.ts +8 -0
  159. package/src/scheduler/scheduled-audit.test.ts +377 -0
  160. package/src/scheduler/scheduled-audit.ts +149 -0
  161. package/src/test/integration-test.ts +325 -0
  162. package/src/tools/analyzer.ts +373 -0
  163. package/src/tools/crawl.ts +293 -0
  164. package/src/tools/files.ts +301 -0
  165. package/src/tools/h1-fixer.ts +249 -0
  166. package/src/tools/index.ts +67 -0
  167. package/src/tracking/github-action.ts +326 -0
  168. package/src/tracking/google-analytics.ts +265 -0
  169. package/src/tracking/index.ts +45 -0
  170. package/src/tracking/report-generator.ts +386 -0
  171. package/src/tracking/search-console.ts +335 -0
  172. package/src/types.ts +134 -0
  173. package/src/utils/http.ts +302 -0
  174. package/src/wasm-adapter.ts +297 -0
  175. package/src/wasm-entry.ts +14 -0
  176. package/tsconfig.json +17 -0
  177. package/tsup.wasm.config.ts +26 -0
  178. package/vitest.config.ts +15 -0
@@ -0,0 +1,246 @@
1
+ /**
2
+ * DOM Size Check
3
+ *
4
+ * Analyzes DOM complexity which affects:
5
+ * - Memory usage
6
+ * - JavaScript performance
7
+ * - Style calculations
8
+ * - Layout/reflow operations
9
+ *
10
+ * Google Lighthouse thresholds:
11
+ * - Total elements: >1500 triggers warning, >3000 fails
12
+ * - Maximum depth: >32 triggers warning
13
+ * - Maximum children: >60 triggers warning
14
+ */
15
+
16
+ import * as cheerio from 'cheerio';
17
+ import type { AuditIssue } from '../types.js';
18
+
19
+ export interface DomSizeData { // Raw measurements returned by analyzeDomSize next to its issues
20
+ totalElements: number; // Number of elements matched by the `*` selector in the parsed document
21
+ maxDepth: number; // Length of the longest tag path found below <body>; `body` itself counts as level 1
22
+ maxChildren: number; // Largest number of direct element children found on any single element
23
+ deepestPath: string[]; // Tag names from `body` down to the deepest element that was found
24
+ widestElement: { // The element that produced maxChildren
25
+ selector: string; // `#id`, `tag.firstClass`, or the bare tag name; empty string when no element has children
26
+ childCount: number; // Same value as maxChildren
27
+ };
28
+ elementBreakdown: { // Per-selector element counts over the whole document
29
+ divs: number;
30
+ spans: number;
31
+ tables: number;
32
+ lists: number; // Counts `ul`, `ol` and `li` elements together
33
+ forms: number;
34
+ inputs: number; // Counts `input`, `select` and `textarea` elements together
35
+ images: number; // Counts `img` elements only
36
+ scripts: number;
37
+ svgs: number;
38
+ };
39
+ }
40
+
41
+ // Lighthouse thresholds
42
+ const THRESHOLDS = { // Limits used by analyzeDomSize; every comparison against them is a strict `>`
42
+ totalElements: { // Exceeding `error` yields DOM_SIZE_EXCESSIVE, exceeding only `warning` yields DOM_SIZE_LARGE
43
+ warning: 1500,
44
+ error: 3000,
45
+ },
46
+ maxDepth: { // One issue code (DOM_DEPTH_EXCESSIVE); `error` only raises its severity
47
+ warning: 32,
48
+ error: 64,
49
+ },
50
+ maxChildren: { // One issue code (DOM_CHILDREN_EXCESSIVE); `error` only raises its severity
51
+ warning: 60,
52
+ error: 120,
53
+ },
54
+ };
56
+
57
/**
 * Measures the DOM complexity of a page and reports size-related audit issues.
 *
 * Parses `html` with cheerio and computes:
 * - the total element count (everything matched by `*`),
 * - the deepest nesting level below `<body>` (`body` itself is level 1),
 * - the element with the most direct element children,
 * - per-selector element counts.
 *
 * An issue is emitted whenever a metric strictly exceeds its `THRESHOLDS`
 * entry.
 *
 * @param html - Raw HTML markup of the page.
 * @param url - Page URL; copied into `affectedUrls` of every issue.
 * @returns The generated issues together with the raw measurements.
 */
export function analyzeDomSize(
  html: string,
  url: string
): { issues: AuditIssue[]; data: DomSizeData } {
  const issues: AuditIssue[] = [];
  const $ = cheerio.load(html);

  // Count total elements
  const allElements = $('*');
  const totalElements = allElements.length;

  // Calculate max depth and find deepest path
  let maxDepth = 0;
  let deepestPath: string[] = [];

  // Walk the tree below <body> with an explicit stack instead of recursion,
  // so a pathologically deep document cannot exhaust the call stack of the
  // very check that is supposed to report it.
  const bodyElement = $('body').get(0);
  if (bodyElement) {
    const pending = [{ node: bodyElement, path: ['body'] }];
    let current = pending.pop();
    while (current) {
      const { node, path } = current;

      // Record the depth at every element, not only at childless ones.
      // An element whose only children are <script>/<style> is never a leaf,
      // so a leaf-only rule would leave that whole branch unmeasured.
      if (path.length > maxDepth) {
        maxDepth = path.length;
        deepestPath = path; // path arrays are never mutated after creation
      }

      // Push children in reverse so the first child is popped first; with the
      // strict `>` above this keeps the first deepest element in document order.
      for (const child of $(node).children().toArray().reverse()) {
        pending.push({ node: child, path: [...path, child.name] });
      }

      current = pending.pop();
    }
  }

  // Find element with most children
  let maxChildren = 0;
  let widestSelector = '';

  allElements.each((_, element) => {
    const $el = $(element);
    const directChildren = $el.children().length;
    if (directChildren > maxChildren) {
      maxChildren = directChildren;
      // Create a simple selector
      const tagName = element.type === 'tag' ? element.name : '';
      const id = $el.attr('id');
      // Class tokens may be separated by any whitespace and the attribute may
      // start with whitespace, so trim and split on a whitespace run.
      const className = $el.attr('class')?.trim().split(/\s+/)[0];
      widestSelector = id ? `#${id}` : (className ? `${tagName}.${className}` : tagName);
    }
  });

  // Element breakdown
  const elementBreakdown = {
    divs: $('div').length,
    spans: $('span').length,
    tables: $('table').length,
    lists: $('ul, ol, li').length,
    forms: $('form').length,
    inputs: $('input, select, textarea').length,
    images: $('img').length,
    scripts: $('script').length,
    svgs: $('svg').length,
  };

  // Generate issues

  // Total elements check: the error tier has its own issue code
  if (totalElements > THRESHOLDS.totalElements.error) {
    issues.push({
      code: 'DOM_SIZE_EXCESSIVE',
      severity: 'error',
      category: 'performance',
      title: 'Excessive DOM size',
      description: `Page has ${totalElements.toLocaleString()} DOM elements (threshold: ${THRESHOLDS.totalElements.error.toLocaleString()}). This severely impacts performance.`,
      impact: 'Significantly slower rendering, increased memory usage, and degraded interactivity. Users on low-end devices will experience major lag.',
      howToFix: 'Reduce DOM complexity: virtualize long lists, lazy-load content, remove unnecessary wrapper elements, use CSS instead of extra markup.',
      affectedUrls: [url],
      details: {
        totalElements,
        breakdown: elementBreakdown,
        suggestions: getDomReductionSuggestions(elementBreakdown),
      },
    });
  } else if (totalElements > THRESHOLDS.totalElements.warning) {
    issues.push({
      code: 'DOM_SIZE_LARGE',
      severity: 'warning',
      category: 'performance',
      title: 'Large DOM size',
      description: `Page has ${totalElements.toLocaleString()} DOM elements (recommended: <${THRESHOLDS.totalElements.warning.toLocaleString()}).`,
      impact: 'Slower style calculations, layout operations, and JavaScript performance.',
      howToFix: 'Consider reducing DOM size: remove unnecessary elements, lazy-load off-screen content, virtualize long lists.',
      affectedUrls: [url],
      details: {
        totalElements,
        threshold: THRESHOLDS.totalElements.warning,
      },
    });
  }

  // Max depth check: one issue code, severity escalates past the error tier
  if (maxDepth > THRESHOLDS.maxDepth.warning) {
    issues.push({
      code: 'DOM_DEPTH_EXCESSIVE',
      severity: maxDepth > THRESHOLDS.maxDepth.error ? 'error' : 'warning',
      category: 'performance',
      title: 'Excessive DOM depth',
      description: `Maximum DOM depth is ${maxDepth} levels (recommended: <${THRESHOLDS.maxDepth.warning}).`,
      impact: 'Deep nesting increases style calculation time and can cause stack overflow in some browsers.',
      howToFix: 'Flatten your HTML structure. Remove unnecessary wrapper divs. Use CSS flexbox/grid instead of nested containers.',
      affectedUrls: [url],
      details: {
        maxDepth,
        // Only the first 10 levels are shown to keep the report readable
        deepestPath: deepestPath.slice(0, 10).join(' > ') + (deepestPath.length > 10 ? '...' : ''),
      },
    });
  }

  // Max children check: one issue code, severity escalates past the error tier
  if (maxChildren > THRESHOLDS.maxChildren.warning) {
    issues.push({
      code: 'DOM_CHILDREN_EXCESSIVE',
      severity: maxChildren > THRESHOLDS.maxChildren.error ? 'error' : 'warning',
      category: 'performance',
      title: 'Element has too many children',
      description: `An element has ${maxChildren} direct children (recommended: <${THRESHOLDS.maxChildren.warning}).`,
      impact: 'Adding/removing children triggers expensive layout recalculations.',
      howToFix: 'Break large lists into smaller chunks. Use virtual scrolling for long lists. Group items into sections.',
      affectedUrls: [url],
      details: {
        maxChildren,
        element: widestSelector,
      },
    });
  }

  return {
    issues,
    data: {
      totalElements,
      maxDepth,
      maxChildren,
      deepestPath,
      widestElement: {
        selector: widestSelector,
        childCount: maxChildren,
      },
      elementBreakdown,
    },
  };
}
221
+
222
/**
 * Builds human-readable hints about which element types dominate a large DOM.
 *
 * Each hint is included only when its count strictly exceeds a fixed limit
 * (divs 500, spans 200, tables 10, list elements 100, SVGs 50). Hints are
 * returned in that fixed order.
 *
 * @param breakdown - Per-selector element counts for the page.
 * @returns The hints that apply; empty when no limit is exceeded.
 */
function getDomReductionSuggestions(breakdown: DomSizeData['elementBreakdown']): string[] {
  const { divs, spans, tables, lists, svgs } = breakdown;

  // One row per rule; the array order is the output order.
  const candidates = [
    {
      triggered: divs > 500,
      text: `High div count (${divs}): Review wrapper divs that may be unnecessary`,
    },
    {
      triggered: spans > 200,
      text: `High span count (${spans}): Consider if all spans are needed for styling`,
    },
    {
      triggered: tables > 10,
      text: `Multiple tables (${tables}): Consider CSS grid/flexbox for layouts`,
    },
    {
      triggered: lists > 100,
      text: `Many list items (${lists}): Consider virtualizing long lists`,
    },
    {
      triggered: svgs > 50,
      text: `Many SVGs (${svgs}): Consider SVG sprites or icon fonts`,
    },
  ];

  return candidates.filter((candidate) => candidate.triggered).map((candidate) => candidate.text);
}
@@ -0,0 +1,194 @@
1
+ import * as cheerio from 'cheerio';
2
+ import { createHash } from 'crypto';
3
+ import type { AuditIssue } from '../types.js';
4
+ import { ISSUE_DEFINITIONS } from '../types.js';
5
+
6
+ export interface ContentHashData { // Per-page fingerprint produced by extractContentHash
6
+ url: string; // URL passed to extractContentHash, stored unchanged
7
+ contentHash: string; // MD5 hex digest of the normalized body text
8
+ titleHash: string; // MD5 hex digest of the lower-cased page title
9
+ title?: string; // Trimmed <title> text; may be an empty string
10
+ wordCount: number; // Whitespace-separated token count of the normalized body text
11
+ textSample: string; // First 200 characters of the normalized body text; input to near-duplicate comparison
12
+ }
14
+
15
+ export interface DuplicateDetectionResult { // Output of detectDuplicates
15
+ duplicates: Array<{ // One entry per unordered page pair judged duplicate
16
+ url1: string;
17
+ url2: string;
18
+ similarity: number; // Percentage: 100 for exact matches, rounded Jaccard score x 100 for near-duplicates
19
+ type: 'exact' | 'near-duplicate'; // 'exact' means identical contentHash
20
+ }>;
21
+ potentialCannibalization: Array<{ // One entry per group of pages sharing the same non-empty title
22
+ pages: string[]; // URLs of every page in the group
23
+ keyword: string; // The shared title, taken from the first page of the group
24
+ similarity: number; // Always 100 as produced by detectDuplicates
25
+ }>;
26
+ }
28
+
29
/**
 * Builds a content fingerprint for a page, used for duplicate detection.
 *
 * Boilerplate containers (scripts, styles, nav/header/footer/aside and their
 * common class equivalents) are removed before the body text is read. The
 * text is then lower-cased, stripped of punctuation and whitespace-collapsed
 * before hashing, so cosmetic differences do not change the hash.
 *
 * MD5 is used purely as a fast fingerprint here, not for security.
 *
 * @param html - Raw HTML markup of the page.
 * @param url - Page URL; stored unchanged on the result.
 * @returns Hashes of the normalized content and title, the word count and a
 *   200-character sample of the normalized text.
 */
export function extractContentHash(html: string, url: string): ContentHashData {
  const $ = cheerio.load(html);

  // Remove non-content elements
  $('script, style, nav, header, footer, aside, .nav, .header, .footer, .sidebar').remove();

  // Only the first <title> is the document title; inline SVGs may carry their
  // own <title> elements, which must not be concatenated into it.
  const title = $('title').first().text().trim();
  const bodyText = $('body').text().replace(/\s+/g, ' ').trim();

  // Create normalized content for hashing. Letters, combining marks and digits
  // of every script are kept: an ASCII-only `\w` class would erase all
  // non-Latin text and make unrelated non-Latin pages hash identically.
  const normalizedContent = bodyText
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .replace(/\s+/g, ' ')
    .trim();

  const contentHash = createHash('md5').update(normalizedContent).digest('hex');
  const titleHash = createHash('md5').update(title.toLowerCase()).digest('hex');

  // ''.split(...) yields [''] (length 1), so an empty page needs an explicit 0.
  // The text is already trimmed and single-spaced, so a plain space split works.
  const wordCount = normalizedContent === '' ? 0 : normalizedContent.split(' ').length;

  return {
    url,
    contentHash,
    titleHash,
    title,
    wordCount,
    textSample: normalizedContent.substring(0, 200),
  };
}
57
+
58
/**
 * Scores how similar two texts are as the Jaccard index of their word shingles.
 *
 * Each text is lower-cased and split on whitespace, then every run of
 * `shingleSize` consecutive words becomes one shingle. The score is
 * `|A ∩ B| / |A ∪ B|` over the two shingle sets.
 *
 * Edge cases:
 * - Two texts without any words score 1; exactly one empty text scores 0.
 * - A text with fewer words than `shingleSize` is treated as one shingle made
 *   of all its words, so two different short texts do not both collapse to
 *   the empty set and score as identical.
 *
 * @param text1 - First text.
 * @param text2 - Second text.
 * @param shingleSize - Words per shingle; values below 1 are treated as 1 and
 *   fractional values are floored.
 * @returns A score between 0 (nothing shared) and 1 (same shingle sets).
 */
function calculateSimilarity(text1: string, text2: string, shingleSize: number = 3): number {
  // A size below 1 (or NaN) would turn every shingle into the empty string.
  const size = shingleSize >= 1 ? Math.floor(shingleSize) : 1;

  const getShingles = (text: string): Set<string> => {
    const shingles = new Set<string>();
    // Drop the empty tokens that leading/trailing whitespace produces.
    const words = text
      .toLowerCase()
      .split(/\s+/)
      .filter((word) => word.length > 0);

    if (words.length === 0) return shingles;

    if (words.length < size) {
      // Too short for a full shingle: compare the text as a whole.
      shingles.add(words.join(' '));
      return shingles;
    }

    for (let i = 0; i <= words.length - size; i++) {
      shingles.add(words.slice(i, i + size).join(' '));
    }
    return shingles;
  };

  const shingles1 = getShingles(text1);
  const shingles2 = getShingles(text2);

  if (shingles1.size === 0 && shingles2.size === 0) return 1;
  if (shingles1.size === 0 || shingles2.size === 0) return 0;

  let intersection = 0;
  for (const shingle of shingles1) {
    if (shingles2.has(shingle)) intersection++;
  }

  const union = shingles1.size + shingles2.size - intersection;
  return intersection / union;
}
83
+
84
/**
 * Finds duplicate content and title collisions across a set of pages.
 *
 * Produces, in this order:
 * 1. Exact duplicates: every pair of pages sharing a `contentHash`.
 * 2. Near-duplicates: pairs with different hashes whose word counts differ by
 *    at most 30% of their average and whose text samples score at least 0.9.
 * 3. Potential cannibalization: groups of two or more pages sharing a
 *    `titleHash`, when the first page of the group has a non-empty title.
 *
 * @param pages - Fingerprints of the pages to compare.
 * @returns The duplicate pairs and the title-collision groups.
 */
export function detectDuplicates(pages: ContentHashData[]): DuplicateDetectionResult {
  // Buckets pages by a string key; buckets keep first-seen order.
  const bucketBy = (keyOf: (page: ContentHashData) => string): Map<string, ContentHashData[]> => {
    const buckets = new Map<string, ContentHashData[]>();
    for (const page of pages) {
      const key = keyOf(page);
      const bucket = buckets.get(key);
      if (bucket) {
        bucket.push(page);
      } else {
        buckets.set(key, [page]);
      }
    }
    return buckets;
  };

  const byTitle = bucketBy((page) => page.titleHash);
  const byContent = bucketBy((page) => page.contentHash);

  const duplicates: DuplicateDetectionResult['duplicates'] = [];

  // Exact duplicates: every unordered pair inside a content bucket.
  for (const sameContent of byContent.values()) {
    for (const [index, first] of sameContent.entries()) {
      for (let other = index + 1; other < sameContent.length; other++) {
        duplicates.push({
          url1: first.url,
          url2: sameContent[other].url,
          similarity: 100,
          type: 'exact',
        });
      }
    }
  }

  // Near-duplicates: compare every remaining unordered pair of pages.
  for (const [index, left] of pages.entries()) {
    for (let other = index + 1; other < pages.length; other++) {
      const right = pages[other];

      // Identical hashes were already reported as exact duplicates.
      if (left.contentHash === right.contentHash) continue;

      // Cheap pre-filter: lengths must be within 30% of their average.
      const wordGap = Math.abs(left.wordCount - right.wordCount);
      const meanWords = (left.wordCount + right.wordCount) / 2;
      if (wordGap / meanWords > 0.3) continue;

      const score = calculateSimilarity(left.textSample, right.textSample);
      if (score < 0.9) continue;

      duplicates.push({
        url1: left.url,
        url2: right.url,
        similarity: Math.round(score * 100),
        type: 'near-duplicate',
      });
    }
  }

  // Title collisions are reported as potential keyword cannibalization.
  const potentialCannibalization: DuplicateDetectionResult['potentialCannibalization'] = [];
  for (const sameTitle of byTitle.values()) {
    const lead = sameTitle[0];
    if (sameTitle.length < 2 || !lead.title) continue;

    potentialCannibalization.push({
      pages: sameTitle.map((page) => page.url),
      keyword: lead.title,
      similarity: 100,
    });
  }

  return { duplicates, potentialCannibalization };
}
156
+
157
/**
 * Turns duplicate-detection results into audit issues for one page.
 *
 * Only entries that involve `currentUrl` are reported. Every duplicate pair,
 * exact or near, is emitted with the `NEAR_DUPLICATE` issue definition, and
 * its `details.type` carries the distinction. Title collisions are emitted
 * with the `KEYWORD_CANNIBALIZATION` definition. Duplicate issues come first.
 *
 * @param result - Output of the duplicate detection step.
 * @param currentUrl - URL of the page the issues are generated for.
 * @returns Issues involving `currentUrl`; empty when there are none.
 */
export function generateDuplicateIssues(
  result: DuplicateDetectionResult,
  currentUrl: string
): AuditIssue[] {
  const duplicateIssues = result.duplicates
    .filter((pair) => pair.url1 === currentUrl || pair.url2 === currentUrl)
    .map((pair): AuditIssue => ({
      ...ISSUE_DEFINITIONS.NEAR_DUPLICATE,
      affectedUrls: [pair.url1, pair.url2],
      details: {
        similarity: `${pair.similarity}%`,
        type: pair.type,
        // The member of the pair that is not the current page
        otherUrl: pair.url1 === currentUrl ? pair.url2 : pair.url1,
      },
    }));

  const cannibalizationIssues = result.potentialCannibalization
    .filter((group) => group.pages.includes(currentUrl))
    .map((group): AuditIssue => ({
      ...ISSUE_DEFINITIONS.KEYWORD_CANNIBALIZATION,
      affectedUrls: group.pages,
      details: {
        keyword: group.keyword,
        pageCount: group.pages.length,
      },
    }));

  return [...duplicateIssues, ...cannibalizationIssues];
}