@rankcli/agent-runtime 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/analyzer-2CSWIQGD.mjs +6 -0
- package/dist/chunk-YNZYHEYM.mjs +774 -0
- package/dist/index.d.mts +4012 -0
- package/dist/index.d.ts +4012 -0
- package/dist/index.js +29672 -0
- package/dist/index.mjs +28602 -0
- package/package.json +53 -0
- package/scripts/build-deno.ts +134 -0
- package/src/audit/ai/analyzer.ts +347 -0
- package/src/audit/ai/index.ts +29 -0
- package/src/audit/ai/prompts/content-analysis.ts +271 -0
- package/src/audit/ai/types.ts +179 -0
- package/src/audit/checks/additional-checks.ts +439 -0
- package/src/audit/checks/ai-citation-worthiness.ts +399 -0
- package/src/audit/checks/ai-content-structure.ts +325 -0
- package/src/audit/checks/ai-readiness.ts +339 -0
- package/src/audit/checks/anchor-text.ts +179 -0
- package/src/audit/checks/answer-conciseness.ts +322 -0
- package/src/audit/checks/asset-minification.ts +270 -0
- package/src/audit/checks/bing-optimization.ts +206 -0
- package/src/audit/checks/brand-mention-optimization.ts +349 -0
- package/src/audit/checks/caching-headers.ts +305 -0
- package/src/audit/checks/canonical-advanced.ts +150 -0
- package/src/audit/checks/canonical-domain.ts +196 -0
- package/src/audit/checks/citation-quality.ts +358 -0
- package/src/audit/checks/client-rendering.ts +542 -0
- package/src/audit/checks/color-contrast.ts +342 -0
- package/src/audit/checks/content-freshness.ts +170 -0
- package/src/audit/checks/content-science.ts +589 -0
- package/src/audit/checks/conversion-elements.ts +526 -0
- package/src/audit/checks/crawlability.ts +220 -0
- package/src/audit/checks/directory-listing.ts +172 -0
- package/src/audit/checks/dom-analysis.ts +191 -0
- package/src/audit/checks/dom-size.ts +246 -0
- package/src/audit/checks/duplicate-content.ts +194 -0
- package/src/audit/checks/eeat-signals.ts +990 -0
- package/src/audit/checks/entity-seo.ts +396 -0
- package/src/audit/checks/featured-snippet.ts +473 -0
- package/src/audit/checks/freshness-signals.ts +443 -0
- package/src/audit/checks/funnel-intent.ts +463 -0
- package/src/audit/checks/hreflang.ts +174 -0
- package/src/audit/checks/html-compliance.ts +302 -0
- package/src/audit/checks/image-dimensions.ts +167 -0
- package/src/audit/checks/images.ts +160 -0
- package/src/audit/checks/indexnow.ts +275 -0
- package/src/audit/checks/interactive-tools.ts +475 -0
- package/src/audit/checks/internal-link-graph.ts +436 -0
- package/src/audit/checks/keyword-analysis.ts +239 -0
- package/src/audit/checks/keyword-cannibalization.ts +385 -0
- package/src/audit/checks/keyword-placement.ts +471 -0
- package/src/audit/checks/links.ts +203 -0
- package/src/audit/checks/llms-txt.ts +224 -0
- package/src/audit/checks/local-seo.ts +296 -0
- package/src/audit/checks/mobile.ts +167 -0
- package/src/audit/checks/modern-images.ts +226 -0
- package/src/audit/checks/navboost-signals.ts +395 -0
- package/src/audit/checks/on-page.ts +209 -0
- package/src/audit/checks/page-resources.ts +285 -0
- package/src/audit/checks/pagination.ts +180 -0
- package/src/audit/checks/performance.ts +153 -0
- package/src/audit/checks/platform-presence.ts +580 -0
- package/src/audit/checks/redirect-analysis.ts +153 -0
- package/src/audit/checks/redirect-chain.ts +389 -0
- package/src/audit/checks/resource-hints.ts +420 -0
- package/src/audit/checks/responsive-css.ts +247 -0
- package/src/audit/checks/responsive-images.ts +396 -0
- package/src/audit/checks/review-ecosystem.ts +415 -0
- package/src/audit/checks/robots-validation.ts +373 -0
- package/src/audit/checks/security-headers.ts +172 -0
- package/src/audit/checks/security.ts +144 -0
- package/src/audit/checks/serp-preview.ts +251 -0
- package/src/audit/checks/site-maturity.ts +444 -0
- package/src/audit/checks/social-meta.test.ts +275 -0
- package/src/audit/checks/social-meta.ts +134 -0
- package/src/audit/checks/soft-404.ts +151 -0
- package/src/audit/checks/structured-data.ts +238 -0
- package/src/audit/checks/tech-detection.ts +496 -0
- package/src/audit/checks/topical-clusters.ts +435 -0
- package/src/audit/checks/tracker-bloat.ts +462 -0
- package/src/audit/checks/tracking-verification.test.ts +371 -0
- package/src/audit/checks/tracking-verification.ts +636 -0
- package/src/audit/checks/url-safety.ts +682 -0
- package/src/audit/deno-entry.ts +66 -0
- package/src/audit/discovery/index.ts +15 -0
- package/src/audit/discovery/link-crawler.ts +232 -0
- package/src/audit/discovery/repo-routes.ts +347 -0
- package/src/audit/engine.ts +620 -0
- package/src/audit/fixes/index.ts +209 -0
- package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
- package/src/audit/fixes/social-meta-fixes.ts +463 -0
- package/src/audit/index.ts +74 -0
- package/src/audit/runner.test.ts +299 -0
- package/src/audit/runner.ts +130 -0
- package/src/audit/types.ts +1953 -0
- package/src/content/featured-snippet.ts +367 -0
- package/src/content/generator.test.ts +534 -0
- package/src/content/generator.ts +501 -0
- package/src/content/headline.ts +317 -0
- package/src/content/index.ts +62 -0
- package/src/content/intent.ts +258 -0
- package/src/content/keyword-density.ts +349 -0
- package/src/content/readability.ts +262 -0
- package/src/executor.ts +336 -0
- package/src/fixer.ts +416 -0
- package/src/frameworks/detector.test.ts +248 -0
- package/src/frameworks/detector.ts +371 -0
- package/src/frameworks/index.ts +68 -0
- package/src/frameworks/recipes/angular.yaml +171 -0
- package/src/frameworks/recipes/astro.yaml +206 -0
- package/src/frameworks/recipes/django.yaml +180 -0
- package/src/frameworks/recipes/laravel.yaml +137 -0
- package/src/frameworks/recipes/nextjs.yaml +268 -0
- package/src/frameworks/recipes/nuxt.yaml +175 -0
- package/src/frameworks/recipes/rails.yaml +188 -0
- package/src/frameworks/recipes/react.yaml +202 -0
- package/src/frameworks/recipes/sveltekit.yaml +154 -0
- package/src/frameworks/recipes/vue.yaml +137 -0
- package/src/frameworks/recipes/wordpress.yaml +209 -0
- package/src/frameworks/suggestion-engine.ts +320 -0
- package/src/geo/geo-content.test.ts +305 -0
- package/src/geo/geo-content.ts +266 -0
- package/src/geo/geo-history.test.ts +473 -0
- package/src/geo/geo-history.ts +433 -0
- package/src/geo/geo-tracker.test.ts +359 -0
- package/src/geo/geo-tracker.ts +411 -0
- package/src/geo/index.ts +10 -0
- package/src/git/commit-helper.test.ts +261 -0
- package/src/git/commit-helper.ts +329 -0
- package/src/git/index.ts +12 -0
- package/src/git/pr-helper.test.ts +284 -0
- package/src/git/pr-helper.ts +307 -0
- package/src/index.ts +66 -0
- package/src/keywords/ai-keyword-engine.ts +1062 -0
- package/src/keywords/ai-summarizer.ts +387 -0
- package/src/keywords/ci-mode.ts +555 -0
- package/src/keywords/engine.ts +359 -0
- package/src/keywords/index.ts +151 -0
- package/src/keywords/llm-judge.ts +357 -0
- package/src/keywords/nlp-analysis.ts +706 -0
- package/src/keywords/prioritizer.ts +295 -0
- package/src/keywords/site-crawler.ts +342 -0
- package/src/keywords/sources/autocomplete.ts +139 -0
- package/src/keywords/sources/competitive-search.ts +450 -0
- package/src/keywords/sources/competitor-analysis.ts +374 -0
- package/src/keywords/sources/dataforseo.ts +206 -0
- package/src/keywords/sources/free-sources.ts +294 -0
- package/src/keywords/sources/gsc.ts +123 -0
- package/src/keywords/topic-grouping.ts +327 -0
- package/src/keywords/types.ts +144 -0
- package/src/keywords/wizard.ts +457 -0
- package/src/loader.ts +40 -0
- package/src/reports/index.ts +7 -0
- package/src/reports/report-generator.test.ts +293 -0
- package/src/reports/report-generator.ts +713 -0
- package/src/scheduler/alerts.test.ts +458 -0
- package/src/scheduler/alerts.ts +328 -0
- package/src/scheduler/index.ts +8 -0
- package/src/scheduler/scheduled-audit.test.ts +377 -0
- package/src/scheduler/scheduled-audit.ts +149 -0
- package/src/test/integration-test.ts +325 -0
- package/src/tools/analyzer.ts +373 -0
- package/src/tools/crawl.ts +293 -0
- package/src/tools/files.ts +301 -0
- package/src/tools/h1-fixer.ts +249 -0
- package/src/tools/index.ts +67 -0
- package/src/tracking/github-action.ts +326 -0
- package/src/tracking/google-analytics.ts +265 -0
- package/src/tracking/index.ts +45 -0
- package/src/tracking/report-generator.ts +386 -0
- package/src/tracking/search-console.ts +335 -0
- package/src/types.ts +134 -0
- package/src/utils/http.ts +302 -0
- package/src/wasm-adapter.ts +297 -0
- package/src/wasm-entry.ts +14 -0
- package/tsconfig.json +17 -0
- package/tsup.wasm.config.ts +26 -0
- package/vitest.config.ts +15 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOM Size Check
|
|
3
|
+
*
|
|
4
|
+
* Analyzes DOM complexity which affects:
|
|
5
|
+
* - Memory usage
|
|
6
|
+
* - JavaScript performance
|
|
7
|
+
* - Style calculations
|
|
8
|
+
* - Layout/reflow operations
|
|
9
|
+
*
|
|
10
|
+
* Google Lighthouse thresholds:
|
|
11
|
+
* - Total elements: >1500 triggers warning, >3000 fails
|
|
12
|
+
* - Maximum depth: >32 triggers warning
|
|
13
|
+
* - Maximum children: >60 triggers warning
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import * as cheerio from 'cheerio';
|
|
17
|
+
import type { AuditIssue } from '../types.js';
|
|
18
|
+
|
|
19
|
+
/**
 * Raw measurements produced by the DOM size check for a single page.
 */
export interface DomSizeData {
  /** Number of elements matched by the universal selector (`*`) in the parsed document. */
  totalElements: number;
  /** Length of the longest tag path found when walking down from `<body>` (`body` itself is level 1). */
  maxDepth: number;
  /** Largest number of direct element children found under any single element. */
  maxChildren: number;
  /** Tag names along the deepest path, starting with `body`. */
  deepestPath: string[];
  /** The element that owns the largest number of direct children. */
  widestElement: {
    /** Short selector for the element: `#id`, `tag.firstClass`, or the bare tag name. */
    selector: string;
    /** Number of direct children of that element (same value as `maxChildren`). */
    childCount: number;
  };
  /** Element counts per selector group. */
  elementBreakdown: {
    /** Count of `div` elements. */
    divs: number;
    /** Count of `span` elements. */
    spans: number;
    /** Count of `table` elements. */
    tables: number;
    /** Combined count of `ul`, `ol` and `li` elements. */
    lists: number;
    /** Count of `form` elements. */
    forms: number;
    /** Combined count of `input`, `select` and `textarea` elements. */
    inputs: number;
    /** Count of `img` elements. */
    images: number;
    /** Count of `script` elements. */
    scripts: number;
    /** Count of `svg` elements. */
    svgs: number;
  };
}
|
|
40
|
+
|
|
41
|
+
/**
 * Limits used by the DOM size check. A metric strictly greater than `warning`
 * is reported; strictly greater than `error` it is reported with error severity.
 *
 * NOTE(review): the file header lists Lighthouse warning levels for all three
 * metrics but an error level only for total elements. The `error` values for
 * depth and children are twice the warning level; confirm their origin.
 */
const THRESHOLDS = {
  totalElements: { warning: 1500, error: 3000 },
  maxDepth: { warning: 32, error: 64 },
  maxChildren: { warning: 60, error: 120 },
};
|
|
56
|
+
|
|
57
|
+
/**
 * Measures the size and shape of a page's DOM and reports performance issues.
 *
 * Three metrics are computed from the parsed HTML:
 * - total element count (every element matched by `*`),
 * - maximum nesting depth below `<body>` (`body` counts as level 1),
 * - the largest number of direct element children under one element.
 *
 * Each metric is compared against `THRESHOLDS`; a value strictly above the
 * warning level produces an issue, and strictly above the error level the
 * issue is raised with error severity.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL of the page; copied into `affectedUrls` of every issue.
 * @returns The issues found plus the raw measurements.
 */
export function analyzeDomSize(
  html: string,
  url: string
): { issues: AuditIssue[]; data: DomSizeData } {
  const issues: AuditIssue[] = [];
  const $ = cheerio.load(html);

  // Count total elements
  const allElements = $('*');
  const totalElements = allElements.length;

  // Calculate max depth and find deepest path
  let maxDepth = 0;
  let deepestPath: string[] = [];

  // Single shared path buffer: pushed on entry, popped on exit, and copied
  // only when a new maximum is found.
  const currentPath: string[] = [];

  function walk(element: cheerio.Cheerio<cheerio.Element>, tagName: string): void {
    currentPath.push(tagName);

    const children = element.children();
    if (children.length === 0) {
      // The maximum depth is always reached at an element without children.
      // Strict `>` keeps the first such path in document order on ties.
      if (currentPath.length > maxDepth) {
        maxDepth = currentPath.length;
        deepestPath = [...currentPath];
      }
    } else {
      // Descend into every element child. `children()` also yields <script>
      // and <style> elements, whose node type is not 'tag'; they must be
      // visited too, otherwise a parent that only contains such children is
      // never measured.
      children.each((_, child) => {
        walk($(child), child.name);
      });
    }

    currentPath.pop();
  }

  // Start from body
  const $body = $('body');
  if ($body.length > 0) {
    walk($body, 'body');
  }

  // Find element with most children
  let maxChildren = 0;
  let widestSelector = '';

  allElements.each((_, element) => {
    const $el = $(element);
    const directChildren = $el.children().length;
    if (directChildren > maxChildren) {
      maxChildren = directChildren;
      // Create a simple selector: prefer the id, then tag + first class, then the tag.
      const id = $el.attr('id');
      // Split on any whitespace after trimming so a class attribute that
      // starts with whitespace or uses tabs/newlines still yields a real class.
      const className = $el.attr('class')?.trim().split(/\s+/)[0];
      widestSelector = id
        ? `#${id}`
        : className
          ? `${element.name}.${className}`
          : element.name;
    }
  });

  // Element breakdown
  const elementBreakdown = {
    divs: $('div').length,
    spans: $('span').length,
    tables: $('table').length,
    lists: $('ul, ol, li').length,
    forms: $('form').length,
    inputs: $('input, select, textarea').length,
    images: $('img').length,
    scripts: $('script').length,
    svgs: $('svg').length,
  };

  // Generate issues

  // Total elements check
  if (totalElements > THRESHOLDS.totalElements.error) {
    issues.push({
      code: 'DOM_SIZE_EXCESSIVE',
      severity: 'error',
      category: 'performance',
      title: 'Excessive DOM size',
      description: `Page has ${totalElements.toLocaleString()} DOM elements (threshold: ${THRESHOLDS.totalElements.error.toLocaleString()}). This severely impacts performance.`,
      impact: 'Significantly slower rendering, increased memory usage, and degraded interactivity. Users on low-end devices will experience major lag.',
      howToFix: 'Reduce DOM complexity: virtualize long lists, lazy-load content, remove unnecessary wrapper elements, use CSS instead of extra markup.',
      affectedUrls: [url],
      details: {
        totalElements,
        breakdown: elementBreakdown,
        suggestions: getDomReductionSuggestions(elementBreakdown),
      },
    });
  } else if (totalElements > THRESHOLDS.totalElements.warning) {
    issues.push({
      code: 'DOM_SIZE_LARGE',
      severity: 'warning',
      category: 'performance',
      title: 'Large DOM size',
      description: `Page has ${totalElements.toLocaleString()} DOM elements (recommended: <${THRESHOLDS.totalElements.warning.toLocaleString()}).`,
      impact: 'Slower style calculations, layout operations, and JavaScript performance.',
      howToFix: 'Consider reducing DOM size: remove unnecessary elements, lazy-load off-screen content, virtualize long lists.',
      affectedUrls: [url],
      details: {
        totalElements,
        threshold: THRESHOLDS.totalElements.warning,
      },
    });
  }

  // Max depth check
  if (maxDepth > THRESHOLDS.maxDepth.warning) {
    issues.push({
      code: 'DOM_DEPTH_EXCESSIVE',
      severity: maxDepth > THRESHOLDS.maxDepth.error ? 'error' : 'warning',
      category: 'performance',
      title: 'Excessive DOM depth',
      description: `Maximum DOM depth is ${maxDepth} levels (recommended: <${THRESHOLDS.maxDepth.warning}).`,
      impact: 'Deep nesting increases style calculation time and can cause stack overflow in some browsers.',
      howToFix: 'Flatten your HTML structure. Remove unnecessary wrapper divs. Use CSS flexbox/grid instead of nested containers.',
      affectedUrls: [url],
      details: {
        maxDepth,
        // Only the first ten levels are shown to keep the report readable.
        deepestPath: deepestPath.slice(0, 10).join(' > ') + (deepestPath.length > 10 ? '...' : ''),
      },
    });
  }

  // Max children check
  if (maxChildren > THRESHOLDS.maxChildren.warning) {
    issues.push({
      code: 'DOM_CHILDREN_EXCESSIVE',
      severity: maxChildren > THRESHOLDS.maxChildren.error ? 'error' : 'warning',
      category: 'performance',
      title: 'Element has too many children',
      description: `An element has ${maxChildren} direct children (recommended: <${THRESHOLDS.maxChildren.warning}).`,
      impact: 'Adding/removing children triggers expensive layout recalculations.',
      howToFix: 'Break large lists into smaller chunks. Use virtual scrolling for long lists. Group items into sections.',
      affectedUrls: [url],
      details: {
        maxChildren,
        element: widestSelector,
      },
    });
  }

  return {
    issues,
    data: {
      totalElements,
      maxDepth,
      maxChildren,
      deepestPath,
      widestElement: {
        selector: widestSelector,
        childCount: maxChildren,
      },
      elementBreakdown,
    },
  };
}
|
|
221
|
+
|
|
222
|
+
/**
 * Builds human-readable hints for reducing DOM size from per-tag counts.
 *
 * A hint is produced for each category whose count is strictly above its
 * limit (divs 500, spans 200, tables 10, lists 100, svgs 50), in that order.
 * Forms, inputs, images and scripts never produce a hint.
 *
 * @param breakdown - Element counts per selector group.
 * @returns Zero or more suggestion strings.
 */
function getDomReductionSuggestions(breakdown: DomSizeData['elementBreakdown']): string[] {
  const rules: { count: number; limit: number; hint: string }[] = [
    {
      count: breakdown.divs,
      limit: 500,
      hint: `High div count (${breakdown.divs}): Review wrapper divs that may be unnecessary`,
    },
    {
      count: breakdown.spans,
      limit: 200,
      hint: `High span count (${breakdown.spans}): Consider if all spans are needed for styling`,
    },
    {
      count: breakdown.tables,
      limit: 10,
      hint: `Multiple tables (${breakdown.tables}): Consider CSS grid/flexbox for layouts`,
    },
    {
      count: breakdown.lists,
      limit: 100,
      hint: `Many list items (${breakdown.lists}): Consider virtualizing long lists`,
    },
    {
      count: breakdown.svgs,
      limit: 50,
      hint: `Many SVGs (${breakdown.svgs}): Consider SVG sprites or icon fonts`,
    },
  ];

  return rules.filter((rule) => rule.count > rule.limit).map((rule) => rule.hint);
}
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
import { createHash } from 'crypto';
|
|
3
|
+
import type { AuditIssue } from '../types.js';
|
|
4
|
+
import { ISSUE_DEFINITIONS } from '../types.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Fingerprint of one page's content, used to compare pages with each other.
 */
export interface ContentHashData {
  /** URL of the page the fingerprint was taken from. */
  url: string;
  /** MD5 hex digest of the normalized body text. */
  contentHash: string;
  /** MD5 hex digest of the lower-cased `<title>` text. */
  titleHash: string;
  /** Trimmed `<title>` text. */
  title?: string;
  /** Number of whitespace-separated words in the normalized body text. */
  wordCount: number;
  /** Leading portion (up to 200 characters) of the normalized body text. */
  textSample: string;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Outcome of comparing a set of pages against each other.
 */
export interface DuplicateDetectionResult {
  /** Page pairs whose content is identical or nearly identical. */
  duplicates: {
    /** URL of the first page in the pair. */
    url1: string;
    /** URL of the second page in the pair. */
    url2: string;
    /** Similarity as a whole-number percentage (100 for exact matches). */
    similarity: number;
    /** `exact` when the content hashes match, `near-duplicate` otherwise. */
    type: 'exact' | 'near-duplicate';
  }[];
  /** Groups of pages that share the same title. */
  potentialCannibalization: {
    /** URLs of every page in the group. */
    pages: string[];
    /** The shared title, reported as the contested keyword. */
    keyword: string;
    /** Similarity percentage for the group. */
    similarity: number;
  }[];
}
|
|
28
|
+
|
|
29
|
+
/**
 * Builds a content fingerprint for a page.
 *
 * Navigation, header, footer, sidebar, script and style elements are removed
 * first. The remaining body text is then lower-cased, stripped of punctuation
 * and collapsed to single spaces before hashing.
 *
 * Letters, combining marks and digits of every script are kept. An ASCII-only
 * `\w` filter would erase all non-Latin text, so that every such page would
 * hash to the same empty string and be reported as an exact duplicate.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL of the page; returned unchanged in the result.
 * @returns Hashes of the normalized body text and title, a word count
 *   (0 for a page without text) and the first 200 characters of the text.
 */
export function extractContentHash(html: string, url: string): ContentHashData {
  const $ = cheerio.load(html);

  // Remove non-content elements
  $('script, style, nav, header, footer, aside, .nav, .header, .footer, .sidebar').remove();

  const title = $('title').text().trim();

  // Create normalized content for hashing
  const normalizedContent = $('body')
    .text()
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .replace(/\s+/g, ' ')
    .trim();

  const contentHash = createHash('md5').update(normalizedContent).digest('hex');
  const titleHash = createHash('md5').update(title.toLowerCase()).digest('hex');

  // Whitespace is already collapsed to single spaces, so splitting on ' ' is
  // enough. Splitting an empty string would yield one empty token, hence the guard.
  const wordCount = normalizedContent === '' ? 0 : normalizedContent.split(' ').length;

  return {
    url,
    contentHash,
    titleHash,
    title,
    wordCount,
    textSample: normalizedContent.substring(0, 200),
  };
}
|
|
57
|
+
|
|
58
|
+
/**
 * Computes the Jaccard similarity of two texts over word shingles.
 *
 * Each text is lower-cased and split on whitespace; every run of
 * `shingleSize` consecutive words forms one shingle. The result is
 * |intersection| / |union| of the two shingle sets.
 *
 * Edge cases:
 * - Two texts without any words are considered identical (1).
 * - One empty and one non-empty text score 0.
 * - A text with fewer words than `shingleSize` is treated as a single
 *   shingle made of all its words, so short, unrelated texts are compared
 *   rather than both collapsing to an empty set and scoring 1.
 * - A `shingleSize` below 1 (or NaN) is treated as 1.
 *
 * @param text1 - First text.
 * @param text2 - Second text.
 * @param shingleSize - Number of consecutive words per shingle (default 3).
 * @returns Similarity in the range 0..1.
 */
function calculateSimilarity(text1: string, text2: string, shingleSize: number = 3): number {
  const size = shingleSize >= 1 ? Math.floor(shingleSize) : 1;

  const getShingles = (text: string): Set<string> => {
    const shingles = new Set<string>();
    // Drop empty tokens produced by leading/trailing whitespace or an empty string.
    const words = text
      .toLowerCase()
      .split(/\s+/)
      .filter((word) => word.length > 0);

    if (words.length === 0) return shingles;

    if (words.length < size) {
      shingles.add(words.join(' '));
      return shingles;
    }

    for (let i = 0; i <= words.length - size; i++) {
      shingles.add(words.slice(i, i + size).join(' '));
    }
    return shingles;
  };

  const shingles1 = getShingles(text1);
  const shingles2 = getShingles(text2);

  if (shingles1.size === 0 && shingles2.size === 0) return 1;
  if (shingles1.size === 0 || shingles2.size === 0) return 0;

  let intersection = 0;
  for (const shingle of shingles1) {
    if (shingles2.has(shingle)) intersection++;
  }

  const union = shingles1.size + shingles2.size - intersection;
  return intersection / union;
}
|
|
83
|
+
|
|
84
|
+
/**
 * Compares every page against every other page and reports duplicates.
 *
 * Three passes are made over `pages`:
 * 1. Pages sharing a content hash are paired up as `exact` duplicates (100).
 * 2. Remaining pairs whose word counts are within 30% of their average are
 *    compared on their text samples; a similarity of at least 0.9 is
 *    reported as a `near-duplicate` with a rounded percentage.
 * 3. Pages sharing a title hash (with a non-empty title) are reported as
 *    potential keyword cannibalization.
 *
 * @param pages - Fingerprints of the pages to compare.
 * @returns Duplicate pairs (exact first, then near-duplicates) and title groups.
 */
export function detectDuplicates(pages: ContentHashData[]): DuplicateDetectionResult {
  // Buckets pages by a key while keeping first-seen order of keys and pages.
  const bucketBy = (keyOf: (page: ContentHashData) => string): Map<string, ContentHashData[]> => {
    const buckets = new Map<string, ContentHashData[]>();
    for (const page of pages) {
      const key = keyOf(page);
      const bucket = buckets.get(key);
      if (bucket) {
        bucket.push(page);
      } else {
        buckets.set(key, [page]);
      }
    }
    return buckets;
  };

  const duplicates: DuplicateDetectionResult['duplicates'] = [];

  // Pass 1: exact duplicates, i.e. every pair inside a content-hash bucket.
  for (const bucket of bucketBy((page) => page.contentHash).values()) {
    for (let a = 0; a < bucket.length - 1; a++) {
      for (let b = a + 1; b < bucket.length; b++) {
        duplicates.push({
          url1: bucket[a].url,
          url2: bucket[b].url,
          similarity: 100,
          type: 'exact',
        });
      }
    }
  }

  // Pass 2: near-duplicates among pairs that are not exact matches.
  for (const [index, left] of pages.entries()) {
    for (let other = index + 1; other < pages.length; other++) {
      const right = pages[other];

      // Already reported as an exact duplicate above.
      if (left.contentHash === right.contentHash) continue;

      // Skip pairs whose lengths differ by more than 30% of their average.
      const wordGap = Math.abs(left.wordCount - right.wordCount);
      const meanWords = (left.wordCount + right.wordCount) / 2;
      if (wordGap / meanWords > 0.3) continue;

      const score = calculateSimilarity(left.textSample, right.textSample);
      if (score >= 0.9) {
        duplicates.push({
          url1: left.url,
          url2: right.url,
          similarity: Math.round(score * 100),
          type: 'near-duplicate',
        });
      }
    }
  }

  // Pass 3: pages sharing a (non-empty) title compete for the same keyword.
  const potentialCannibalization: DuplicateDetectionResult['potentialCannibalization'] = [];
  for (const bucket of bucketBy((page) => page.titleHash).values()) {
    const sharedTitle = bucket[0].title;
    if (bucket.length > 1 && sharedTitle) {
      potentialCannibalization.push({
        pages: bucket.map((page) => page.url),
        keyword: sharedTitle,
        similarity: 100,
      });
    }
  }

  return { duplicates, potentialCannibalization };
}
|
|
156
|
+
|
|
157
|
+
/**
 * Converts duplicate-detection results into audit issues for one page.
 *
 * Only entries that involve `currentUrl` are turned into issues. Every
 * duplicate pair, whether `exact` or `near-duplicate`, is reported with the
 * `NEAR_DUPLICATE` issue definition; the pair's `type` is carried in `details`.
 *
 * @param result - Output of the duplicate detection.
 * @param currentUrl - URL of the page the issues are generated for.
 * @returns Duplicate issues followed by keyword-cannibalization issues.
 */
export function generateDuplicateIssues(
  result: DuplicateDetectionResult,
  currentUrl: string
): AuditIssue[] {
  // Duplicate pairs in which the current page takes part.
  const duplicateIssues: AuditIssue[] = result.duplicates
    .filter((pair) => pair.url1 === currentUrl || pair.url2 === currentUrl)
    .map((pair) => {
      const counterpart = pair.url1 === currentUrl ? pair.url2 : pair.url1;
      return {
        ...ISSUE_DEFINITIONS.NEAR_DUPLICATE,
        affectedUrls: [pair.url1, pair.url2],
        details: {
          similarity: `${pair.similarity}%`,
          type: pair.type,
          otherUrl: counterpart,
        },
      };
    });

  // Title groups that contain the current page.
  const cannibalizationIssues: AuditIssue[] = result.potentialCannibalization
    .filter((group) => group.pages.includes(currentUrl))
    .map((group) => ({
      ...ISSUE_DEFINITIONS.KEYWORD_CANNIBALIZATION,
      affectedUrls: group.pages,
      details: {
        keyword: group.keyword,
        pageCount: group.pages.length,
      },
    }));

  return [...duplicateIssues, ...cannibalizationIssues];
}
|