aeorank 3.0.3 → 3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -24
- package/dist/browser.d.ts +3 -3
- package/dist/browser.js +429 -24
- package/dist/browser.js.map +1 -1
- package/dist/cli.js +365 -20
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +429 -24
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -3
- package/dist/index.d.ts +3 -3
- package/dist/index.js +429 -24
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/cli.js
CHANGED
|
@@ -76,6 +76,75 @@ function detectParkedDomain(bodySnippet) {
|
|
|
76
76
|
return { isParked: false };
|
|
77
77
|
}
|
|
78
78
|
|
|
79
|
+
// src/duplicate-content.ts
|
|
80
|
+
// Case-insensitive, whole-word list of call-to-action and legal phrases.
// isBoilerplateParagraph tests normalized paragraph text against it.
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
81
|
+
// createParagraph discards paragraphs with fewer tokens than this.
var MIN_SUBSTANTIVE_WORDS = 15;
|
|
82
|
+
// Upper bound on total tokens for a "label: value" paragraph to count as metadata.
var MAX_METADATA_WORDS = 24;
|
|
83
|
+
// Upper bound on tokens in the label (the text before the first colon) of a metadata paragraph.
var MAX_METADATA_LABEL_WORDS = 4;
|
|
84
|
+
/**
 * Reduce an HTML fragment to lower-cased plain text for similarity comparison.
 *
 * Tags and character references are each replaced by a space, whitespace runs
 * are collapsed to a single space, and the result is trimmed and lower-cased.
 *
 * @param {string} htmlFragment - Raw HTML of one paragraph (tags included).
 * @returns {string} Normalized plain text.
 */
function normalizeParagraphText(htmlFragment) {
  return htmlFragment
    .replace(/<[^>]*>/g, " ")
    // Named (&amp;), decimal (&#39;) and hexadecimal (&#x27;) references are
    // all dropped. Matching only \w+ would leave numeric references in the
    // text, so the same sentence encoded two ways would not compare equal.
    .replace(/&(?:#\d+|#x[0-9a-f]+|\w+);/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
}
|
|
87
|
+
/**
 * Split text into word tokens, trimming punctuation from both ends of each word.
 *
 * Characters inside a word (for example the apostrophe in "don't") are kept.
 * Edge trimming is Unicode-aware: any letter or number in any script counts as
 * part of the word, so accented and non-Latin words are not truncated or lost.
 *
 * @param {string} text - Whitespace-separated text.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize(text) {
  // Runs of non-letter, non-number characters at the start or end of a word.
  const edgePunctuation = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu;
  return text
    .split(/\s+/)
    .map((word) => word.replace(edgePunctuation, ""))
    .filter((word) => word.length > 0);
}
|
|
90
|
+
/**
 * Decide whether a paragraph is short call-to-action, legal or consent text.
 *
 * @param {string} text - Normalized paragraph text.
 * @param {number} words - Number of tokens in the paragraph.
 * @returns {boolean} True when the paragraph is under 20 words and matches
 *   BOILERPLATE_PATTERNS, or is under 30 words and mentions cookie/GDPR/
 *   consent/opt-out wording.
 */
function isBoilerplateParagraph(text, words) {
  const isShortCallToAction = words < 20 && BOILERPLATE_PATTERNS.test(text);
  if (isShortCallToAction) {
    return true;
  }
  const mentionsConsent = /\b(cookie|gdpr|consent|opt.out)\b/i.test(text);
  return mentionsConsent && words < 30;
}
|
|
95
|
+
/**
 * Decide whether a paragraph looks like a short "Label: value" metadata line.
 *
 * @param {string} text - Normalized paragraph text.
 * @param {number} words - Number of tokens in the whole paragraph.
 * @returns {boolean} True when the text starts with a 1-60 character label
 *   followed by a colon and whitespace, the label has between one and
 *   MAX_METADATA_LABEL_WORDS tokens, and the paragraph has at most
 *   MAX_METADATA_WORDS tokens.
 */
function isMetadataParagraph(text, words) {
  const label = text.match(/^([^:]{1,60}):\s+/);
  if (!label) {
    return false;
  }
  const labelWordCount = tokenize(label[1]).length;
  if (labelWordCount === 0 || labelWordCount > MAX_METADATA_LABEL_WORDS) {
    return false;
  }
  return words <= MAX_METADATA_WORDS;
}
|
|
101
|
+
/**
 * Build the set of word n-grams ("shingles") for a token list.
 *
 * Each shingle is `n` consecutive tokens joined by a single space. Fewer than
 * `n` tokens produce an empty set.
 *
 * @param {string[]} words - Tokens in reading order.
 * @param {number} [n=4] - Shingle length in tokens; must be a positive integer.
 * @returns {Set<string>} Distinct shingles in first-seen order.
 * @throws {RangeError} If `n` is not a positive integer. A size of 0 would
 *   yield the single shingle "" for every input, making any two paragraphs
 *   look identical.
 */
function buildShinglesFromTokens(words, n = 4) {
  if (!Number.isInteger(n) || n < 1) {
    throw new RangeError(`Shingle size must be a positive integer, got ${n}`);
  }
  const shingles = new Set();
  const lastStart = words.length - n;
  for (let start = 0; start <= lastStart; start++) {
    shingles.add(words.slice(start, start + n).join(" "));
  }
  return shingles;
}
|
|
108
|
+
/**
 * Turn one HTML paragraph fragment into a comparable paragraph record.
 *
 * @param {string} htmlFragment - Raw HTML of a single paragraph.
 * @returns {{text: string, shingles: Set<string>} | null} The normalized text
 *   and its shingle set, or null when the paragraph has fewer than
 *   MIN_SUBSTANTIVE_WORDS tokens, is boilerplate, is metadata, or produces
 *   fewer than three distinct shingles.
 */
function createParagraph(htmlFragment) {
  const text = normalizeParagraphText(htmlFragment);
  const tokens = tokenize(text);
  const wordCount = tokens.length;

  const isTooShort = wordCount < MIN_SUBSTANTIVE_WORDS;
  if (isTooShort || isBoilerplateParagraph(text, wordCount) || isMetadataParagraph(text, wordCount)) {
    return null;
  }

  const shingles = buildShinglesFromTokens(tokens);
  return shingles.size < 3 ? null : { text, shingles };
}
|
|
118
|
+
/**
 * Remove markup that is not part of a page's main prose.
 *
 * Removal happens in three passes so that earlier passes cannot be confused
 * by text that later passes would delete:
 *   1. script / style / noscript elements, whose bodies may contain strings
 *      that look like tags or comments;
 *   2. HTML comments, so commented-out paragraphs are not analysed as content;
 *   3. nav / header / footer / aside elements.
 *
 * @param {string} html - Full page HTML.
 * @returns {string} The HTML with those regions deleted.
 */
function stripNonContentHtml(html) {
  return html
    .replace(/<(script|style|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "")
    .replace(/<!--[\s\S]*?-->/g, "")
    .replace(/<(nav|header|footer|aside)\b[^>]*>[\s\S]*?<\/\1>/gi, "");
}
|
|
121
|
+
/**
 * Collect every substantive paragraph on a page.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} One record per <p>
 *   element that survives createParagraph's filters, in document order.
 */
function extractDuplicateContentParagraphs(html) {
  const fragments = stripNonContentHtml(html).match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
  const paragraphs = [];
  for (const fragment of fragments) {
    const paragraph = createParagraph(fragment);
    if (paragraph !== null) {
      paragraphs.push(paragraph);
    }
  }
  return paragraphs;
}
|
|
126
|
+
/**
 * Split a page into heading-delimited sections of substantive paragraphs.
 *
 * The cleaned HTML is cut immediately before every <h2>/<h3> opening tag.
 * Content ahead of the first such heading is labelled "(intro)". Sections
 * with no surviving paragraphs are omitted.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in document order.
 */
function extractDuplicateContentSections(html) {
  const chunks = stripNonContentHtml(html).split(/(?=<h[23]\b[^>]*>)/i);
  return chunks.reduce((sections, chunk) => {
    const headingHit = chunk.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
    let heading = "(intro)";
    if (headingHit) {
      // Drop inline markup inside the heading so only its text remains.
      heading = headingHit[1].replace(/<[^>]*>/g, "").trim();
    }

    const paragraphs = [];
    for (const fragment of chunk.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []) {
      const paragraph = createParagraph(fragment);
      if (paragraph !== null) {
        paragraphs.push(paragraph);
      }
    }

    if (paragraphs.length > 0) {
      sections.push({ heading, paragraphs });
    }
    return sections;
  }, []);
}
|
|
138
|
+
/**
 * Jaccard similarity of two shingle sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} A value in [0, 1]; 0 when both sets are empty.
 */
function shingleJaccardSimilarity(a, b) {
  if (a.size === 0 && b.size === 0) return 0;

  // Walk the smaller set and probe the larger one; the intersection is the
  // same either way, but this does the fewest lookups.
  const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
  let intersection = 0;
  for (const shingle of smaller) {
    if (larger.has(shingle)) intersection++;
  }

  // At least one set is non-empty here, so the union is never zero.
  return intersection / (a.size + b.size - intersection);
}
|
|
147
|
+
|
|
79
148
|
// src/site-crawler.ts
|
|
80
149
|
async function fetchText(url) {
|
|
81
150
|
try {
|
|
@@ -2556,6 +2625,186 @@ function checkImageContextAI(data) {
|
|
|
2556
2625
|
}
|
|
2557
2626
|
return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
|
|
2558
2627
|
}
|
|
2628
|
+
/**
 * Find pairs of sections on one page that share near-duplicate paragraphs.
 *
 * Every unordered pair of sections is checked. A pair is reported at most
 * once, using the first paragraph combination whose shingle similarity
 * exceeds 0.4.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{headingA: string, headingB: string, similarity: number, sample: string}>}
 *   One entry per duplicated section pair; `similarity` is a rounded
 *   percentage and `sample` is the first 80 characters of the matching
 *   paragraph from the earlier section.
 */
function findIntraPageDuplicates(html) {
  const sections = extractDuplicateContentSections(html);
  const pairs = [];
  if (sections.length < 2) {
    return pairs;
  }

  // Returns the report entry for the first similar paragraph pair, or null.
  const firstDuplicate = (left, right) => {
    for (const pA of left.paragraphs) {
      for (const pB of right.paragraphs) {
        const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
        if (sim > 0.4) {
          return {
            headingA: left.heading,
            headingB: right.heading,
            similarity: Math.round(sim * 100),
            sample: pA.text.slice(0, 80)
          };
        }
      }
    }
    return null;
  };

  sections.forEach((left, index) => {
    for (const right of sections.slice(index + 1)) {
      const hit = firstDuplicate(left, right);
      if (hit !== null) {
        pairs.push(hit);
      }
    }
  });
  return pairs;
}
|
|
2655
|
+
/**
 * Site-level criterion: duplicate text blocks between sections of the same page.
 *
 * The homepage and every blog-sample page are scanned with
 * findIntraPageDuplicates. The score (0-10) is driven by the share of pages
 * with at least one duplicated section pair. Up to three pages, with up to
 * two pairs each, are listed as individual findings.
 *
 * @param {object} data - Crawl data; reads `homepage`, `blogSample` and `domain`.
 * @returns {{criterion: string, criterion_label: string, score: number, status: string, findings: object[], fix_priority: string}}
 *   The criterion result.
 */
function checkDuplicateContent(data) {
  const findings = [];
  const pages = [];
  if (data.homepage) {
    pages.push({ html: data.homepage.text, url: data.homepage.finalUrl || `https://${data.domain}/` });
  }
  for (const entry of data.blogSample || []) {
    pages.push({ html: entry.text, url: entry.finalUrl || "" });
  }

  if (pages.length === 0) {
    findings.push({ severity: "critical", detail: "No pages available for duplicate content analysis" });
    return { criterion: "duplicate_content", criterion_label: "Duplicate Content Blocks", score: 0, status: "fail", findings, fix_priority: "P1" };
  }

  // Keep only the pages on which at least one duplicated section pair was found.
  const dupDetails = pages
    .map((page) => ({ url: page.url, pairs: findIntraPageDuplicates(page.html) }))
    .filter((entry) => entry.pairs.length > 0);
  const totalDupPages = dupDetails.length;
  const totalDupPairs = dupDetails.reduce((sum, entry) => sum + entry.pairs.length, 0);
  const dupRatio = totalDupPages / pages.length;
  const percent = Math.round(dupRatio * 100);

  // Map the counts onto a score and the matching summary finding.
  const grade = () => {
    if (totalDupPairs === 0) {
      return [10, { severity: "info", detail: `${pages.length} pages analyzed - no duplicate content blocks detected` }];
    }
    if (dupRatio <= 0.05 && totalDupPairs <= 2) {
      return [9, { severity: "info", detail: `${totalDupPairs} duplicate block pair(s) on ${totalDupPages} page(s) - minor` }];
    }
    if (dupRatio <= 0.1) {
      return [7, { severity: "low", detail: `${totalDupPairs} duplicate block pair(s) across ${totalDupPages} page(s)`, fix: "Rewrite duplicate sections to provide unique content in each" }];
    }
    if (dupRatio <= 0.2) {
      return [5, { severity: "medium", detail: `${totalDupPages} pages (${percent}%) contain duplicate content blocks`, fix: "Rewrite or remove repeated text blocks - LLMs may flag this as low-quality content" }];
    }
    if (dupRatio <= 0.4) {
      return [3, { severity: "medium", detail: `${totalDupPages} pages (${percent}%) have significant duplicate content`, fix: "Widespread duplicate blocks reduce content authority - rewrite each section with unique angles" }];
    }
    return [0, { severity: "high", detail: `${totalDupPages} pages (${percent}%) contain duplicate content blocks`, fix: "Severe content duplication across the site - LLMs will likely reduce citation authority" }];
  };
  const [score, summary] = grade();
  findings.push(summary);

  for (const { url, pairs } of dupDetails.slice(0, 3)) {
    const shortUrl = url.slice(0, 60);
    for (const pair of pairs.slice(0, 2)) {
      findings.push({
        severity: "low",
        detail: `${shortUrl}: '${pair.headingA}' and '${pair.headingB}' share ${pair.similarity}% similar text ("${pair.sample}...")`,
        fix: `Rewrite one of these sections to eliminate duplicate content`
      });
    }
  }

  let status = "fail";
  if (score >= 7) {
    status = "pass";
  } else if (score >= 4) {
    status = "partial";
  }
  return { criterion: "duplicate_content", criterion_label: "Duplicate Content Blocks", score, status, findings, fix_priority: score >= 7 ? "P3" : "P1" };
}
|
|
2714
|
+
/**
 * Site-level criterion: paragraphs copied between different pages.
 *
 * The homepage and blog-sample pages are reduced to substantive paragraphs.
 * Paragraphs whose fingerprint (first five shingles) appears on at least
 * max(3, 40% of pages) pages are treated as site-wide boilerplate and are
 * not used as the left-hand side of a comparison. Two pages form a duplicate
 * pair when at least two paragraphs of the first have a match (similarity
 * above 0.4) on the second. The score (0-10) is driven by the share of pages
 * that take part in any duplicate pair.
 *
 * @param {object} data - Crawl data; reads `homepage`, `blogSample` and `domain`.
 * @returns {{criterion: string, criterion_label: string, score: number, status: string, findings: object[], fix_priority: string}}
 *   The criterion result.
 */
function checkCrossPageDuplication(data) {
  const findings = [];
  const pages = [];

  // Fingerprints are computed once per paragraph instead of inside the
  // pairwise loop below.
  const fingerprint = (paragraph) => [...paragraph.shingles].slice(0, 5).join("|");
  const toPage = (url, html) => {
    const paragraphs = extractDuplicateContentParagraphs(html);
    return { url, paragraphs, fingerprints: paragraphs.map((p) => fingerprint(p)) };
  };

  if (data.homepage) {
    pages.push(toPage(data.homepage.finalUrl || `https://${data.domain}/`, data.homepage.text));
  }
  if (data.blogSample) {
    for (const page of data.blogSample) {
      pages.push(toPage(page.finalUrl || "", page.text));
    }
  }
  if (pages.length <= 1) {
    findings.push({ severity: "info", detail: "Not enough pages to assess cross-page duplication" });
    return { criterion: "cross_page_duplication", criterion_label: "Cross-Page Duplicate Content", score: 5, status: "partial", findings, fix_priority: "P3" };
  }

  // Number of distinct pages on which each fingerprint occurs.
  const paragraphPageCount = new Map();
  for (const page of pages) {
    for (const fp of new Set(page.fingerprints)) {
      paragraphPageCount.set(fp, (paragraphPageCount.get(fp) || 0) + 1);
    }
  }
  const boilerplateThreshold = Math.max(3, pages.length * 0.4);
  const siteBoilerprints = new Set();
  for (const [fp, count] of paragraphPageCount) {
    if (count >= boilerplateThreshold) siteBoilerprints.add(fp);
  }

  const crossDupPairs = [];
  // Affected pages are tracked by index. The display URLs are truncated to 60
  // characters (and are "" when finalUrl is missing), so using them as
  // identity would merge distinct pages and understate the affected ratio.
  const affectedPages = new Set();
  for (let i = 0; i < pages.length; i++) {
    for (let j = i + 1; j < pages.length; j++) {
      const pageA = pages[i];
      const pageB = pages[j];
      let dupCount = 0;
      let sample = "";
      pageA.paragraphs.forEach((pA, k) => {
        if (siteBoilerprints.has(pageA.fingerprints[k])) return;
        const hasMatch = pageB.paragraphs.some(
          (pB) => shingleJaccardSimilarity(pA.shingles, pB.shingles) > 0.4
        );
        if (!hasMatch) return;
        dupCount++;
        if (!sample) sample = pA.text.slice(0, 80);
      });
      if (dupCount >= 2) {
        crossDupPairs.push({
          urlA: pageA.url.slice(0, 60),
          urlB: pageB.url.slice(0, 60),
          dupCount,
          sample
        });
        affectedPages.add(i);
        affectedPages.add(j);
      }
    }
  }

  const affectedCount = affectedPages.size;
  // pages.length is at least 2 here, so the division is always defined.
  const affectedRatio = affectedCount / pages.length;
  const affectedPercent = Math.round(affectedRatio * 100);
  const totalDupParagraphs = crossDupPairs.reduce((sum, pair) => sum + pair.dupCount, 0);

  let score;
  if (crossDupPairs.length === 0) {
    score = 10;
    findings.push({ severity: "info", detail: `${pages.length} pages analyzed - no cross-page content duplication detected` });
  } else if (affectedRatio <= 0.05 && totalDupParagraphs <= 4) {
    score = 9;
    findings.push({ severity: "info", detail: `${totalDupParagraphs} shared paragraph(s) across ${affectedCount} page(s) - minor` });
  } else if (affectedRatio <= 0.1) {
    score = 7;
    findings.push({ severity: "low", detail: `${totalDupParagraphs} shared paragraphs across ${affectedCount} pages`, fix: "Rewrite shared content so each page provides a unique perspective" });
  } else if (affectedRatio <= 0.2) {
    score = 5;
    findings.push({ severity: "medium", detail: `${affectedCount} pages (${affectedPercent}%) share duplicate paragraphs`, fix: "Significant cross-page duplication - AI engines may only index one version" });
  } else if (affectedRatio <= 0.4) {
    score = 3;
    findings.push({ severity: "medium", detail: `${affectedCount} pages (${affectedPercent}%) contain shared content blocks`, fix: "Widespread copy-paste content across pages reduces overall site authority" });
  } else {
    score = 0;
    findings.push({ severity: "high", detail: `${affectedCount} pages (${affectedPercent}%) share duplicate content`, fix: "Severe cross-page duplication - AI engines will likely ignore redundant pages entirely" });
  }

  for (const pair of crossDupPairs.slice(0, 3)) {
    findings.push({
      severity: "low",
      detail: `${pair.dupCount} shared paragraph(s): ${pair.urlA} \u2194 ${pair.urlB} ("${pair.sample}...")`,
      fix: "Rewrite shared paragraphs so each page has unique content"
    });
  }
  return { criterion: "cross_page_duplication", criterion_label: "Cross-Page Duplicate Content", score, status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P1" };
}
|
|
2559
2808
|
function auditSiteFromData(data) {
|
|
2560
2809
|
const topicCoherence = checkTopicCoherence(data);
|
|
2561
2810
|
const cannibalization = checkContentCannibalization(data, topicCoherence.score);
|
|
@@ -2594,7 +2843,10 @@ function auditSiteFromData(data) {
|
|
|
2594
2843
|
checkEvidencePackaging(data),
|
|
2595
2844
|
checkEntityDisambiguation(data),
|
|
2596
2845
|
checkExtractionFriction(data),
|
|
2597
|
-
checkImageContextAI(data)
|
|
2846
|
+
checkImageContextAI(data),
|
|
2847
|
+
// V3 criteria (#35-#36)
|
|
2848
|
+
checkDuplicateContent(data),
|
|
2849
|
+
checkCrossPageDuplication(data)
|
|
2598
2850
|
];
|
|
2599
2851
|
}
|
|
2600
2852
|
|
|
@@ -2612,11 +2864,11 @@ var WEIGHTS = {
|
|
|
2612
2864
|
// Information density per page
|
|
2613
2865
|
direct_answer_density: 0.05,
|
|
2614
2866
|
// Direct answers to queries
|
|
2615
|
-
qa_content_format: 0.
|
|
2867
|
+
qa_content_format: 0.04,
|
|
2616
2868
|
// Answer-shaped content structure
|
|
2617
|
-
query_answer_alignment: 0.
|
|
2869
|
+
query_answer_alignment: 0.04,
|
|
2618
2870
|
// Relevance to actual AI queries
|
|
2619
|
-
faq_section: 0.
|
|
2871
|
+
faq_section: 0.03,
|
|
2620
2872
|
// Structured Q&A pairs
|
|
2621
2873
|
// ─── Content Organization (~30%) ──────────────────────────────────────────
|
|
2622
2874
|
// HOW easily AI engines can extract and trust your content.
|
|
@@ -2664,8 +2916,13 @@ var WEIGHTS = {
|
|
|
2664
2916
|
// Clear entity boundaries
|
|
2665
2917
|
extraction_friction: 0.02,
|
|
2666
2918
|
// Sentence length, voice, jargon
|
|
2667
|
-
image_context_ai: 0.01
|
|
2919
|
+
image_context_ai: 0.01,
|
|
2668
2920
|
// Figure/figcaption, alt text quality
|
|
2921
|
+
// ─── V3 Criteria ────────────────────────────────────────────────────────
|
|
2922
|
+
duplicate_content: 0.05,
|
|
2923
|
+
// Duplicate text blocks within pages
|
|
2924
|
+
cross_page_duplication: 0.03
|
|
2925
|
+
// Same paragraphs copied across pages
|
|
2669
2926
|
};
|
|
2670
2927
|
function calculateOverallScore(criteria) {
|
|
2671
2928
|
let totalWeight = 0;
|
|
@@ -2778,7 +3035,9 @@ var PILLARS = {
|
|
|
2778
3035
|
"fact_density",
|
|
2779
3036
|
"citation_ready_writing",
|
|
2780
3037
|
"answer_first_placement",
|
|
2781
|
-
"evidence_packaging"
|
|
3038
|
+
"evidence_packaging",
|
|
3039
|
+
"duplicate_content",
|
|
3040
|
+
"cross_page_duplication"
|
|
2782
3041
|
],
|
|
2783
3042
|
"Content Structure": [
|
|
2784
3043
|
"direct_answer_density",
|
|
@@ -2843,6 +3102,8 @@ var CLIENT_NAMES = {
|
|
|
2843
3102
|
image_context_ai: "Image Context for AI",
|
|
2844
3103
|
schema_coverage: "Schema Coverage",
|
|
2845
3104
|
speakable_schema: "Speakable Schema",
|
|
3105
|
+
duplicate_content: "Duplicate Content Blocks",
|
|
3106
|
+
cross_page_duplication: "Cross-Page Duplicate Content",
|
|
2846
3107
|
content_cannibalization: "Content Cannibalization",
|
|
2847
3108
|
llms_txt: "llms.txt File",
|
|
2848
3109
|
robots_txt: "robots.txt for AI",
|
|
@@ -2860,10 +3121,12 @@ var PILLAR_WEIGHTS = {
|
|
|
2860
3121
|
citation_ready_writing: 0.04,
|
|
2861
3122
|
answer_first_placement: 0.03,
|
|
2862
3123
|
evidence_packaging: 0.03,
|
|
3124
|
+
duplicate_content: 0.05,
|
|
3125
|
+
cross_page_duplication: 0.03,
|
|
2863
3126
|
direct_answer_density: 0.05,
|
|
2864
|
-
qa_content_format: 0.
|
|
2865
|
-
query_answer_alignment: 0.
|
|
2866
|
-
faq_section: 0.
|
|
3127
|
+
qa_content_format: 0.04,
|
|
3128
|
+
query_answer_alignment: 0.04,
|
|
3129
|
+
faq_section: 0.03,
|
|
2867
3130
|
table_list_extractability: 0.03,
|
|
2868
3131
|
definition_patterns: 0.02,
|
|
2869
3132
|
entity_disambiguation: 0.02,
|
|
@@ -2896,6 +3159,8 @@ var CRITERION_EFFORT = {
|
|
|
2896
3159
|
citation_ready_writing: "Medium",
|
|
2897
3160
|
answer_first_placement: "Medium",
|
|
2898
3161
|
evidence_packaging: "Medium",
|
|
3162
|
+
duplicate_content: "Medium",
|
|
3163
|
+
cross_page_duplication: "Medium",
|
|
2899
3164
|
direct_answer_density: "Medium",
|
|
2900
3165
|
qa_content_format: "Medium",
|
|
2901
3166
|
query_answer_alignment: "Medium",
|
|
@@ -2951,6 +3216,8 @@ var FIX_DESCRIPTIONS = {
|
|
|
2951
3216
|
image_context_ai: "Wrap images in <figure>/<figcaption> with descriptive alt text.",
|
|
2952
3217
|
schema_coverage: "Extend structured data to inner pages (articles, services, products).",
|
|
2953
3218
|
speakable_schema: "Add SpeakableSpecification schema for voice assistant compatibility.",
|
|
3219
|
+
duplicate_content: "Rewrite duplicate text blocks so each section provides unique value.",
|
|
3220
|
+
cross_page_duplication: "Rewrite shared paragraphs across pages so each page has unique content.",
|
|
2954
3221
|
content_cannibalization: "Consolidate overlapping pages or differentiate titles and H1 headings.",
|
|
2955
3222
|
llms_txt: "Create a /llms.txt file describing your site for AI engines.",
|
|
2956
3223
|
robots_txt: "Update robots.txt to explicitly allow AI crawlers.",
|
|
@@ -3046,7 +3313,9 @@ var CRITERION_LABELS = {
|
|
|
3046
3313
|
"Evidence Packaging": "Evidence Packaging",
|
|
3047
3314
|
"Entity Disambiguation": "Entity Disambiguation",
|
|
3048
3315
|
"Extraction Friction Score": "Extraction Friction Score",
|
|
3049
|
-
"Image Context for AI": "Image Context for AI"
|
|
3316
|
+
"Image Context for AI": "Image Context for AI",
|
|
3317
|
+
"Duplicate Content Blocks": "Duplicate Content Blocks",
|
|
3318
|
+
"Cross-Page Duplicate Content": "Cross-Page Duplicate Content"
|
|
3050
3319
|
};
|
|
3051
3320
|
function scoreToStatus(score) {
|
|
3052
3321
|
if (score === 0) return "MISSING";
|
|
@@ -3141,9 +3410,9 @@ var CRITERION_WEIGHTS = {
|
|
|
3141
3410
|
content_depth: 0.07,
|
|
3142
3411
|
fact_density: 0.06,
|
|
3143
3412
|
direct_answer_density: 0.05,
|
|
3144
|
-
qa_content_format: 0.
|
|
3145
|
-
query_answer_alignment: 0.
|
|
3146
|
-
faq_section: 0.
|
|
3413
|
+
qa_content_format: 0.04,
|
|
3414
|
+
query_answer_alignment: 0.04,
|
|
3415
|
+
faq_section: 0.03,
|
|
3147
3416
|
// Content Organization (~30%)
|
|
3148
3417
|
entity_consistency: 0.05,
|
|
3149
3418
|
internal_linking: 0.04,
|
|
@@ -3172,7 +3441,10 @@ var CRITERION_WEIGHTS = {
|
|
|
3172
3441
|
evidence_packaging: 0.03,
|
|
3173
3442
|
entity_disambiguation: 0.02,
|
|
3174
3443
|
extraction_friction: 0.02,
|
|
3175
|
-
image_context_ai: 0.01
|
|
3444
|
+
image_context_ai: 0.01,
|
|
3445
|
+
// V3 Criteria
|
|
3446
|
+
duplicate_content: 0.05,
|
|
3447
|
+
cross_page_duplication: 0.03
|
|
3176
3448
|
};
|
|
3177
3449
|
var OPPORTUNITY_TEMPLATES = {
|
|
3178
3450
|
llms_txt: {
|
|
@@ -3295,6 +3567,16 @@ var OPPORTUNITY_TEMPLATES = {
|
|
|
3295
3567
|
effort: "Medium",
|
|
3296
3568
|
description: "Ensure every question-format heading (H2/H3) is followed by a direct answer paragraph. This pattern is ideal for AI engine snippet extraction."
|
|
3297
3569
|
},
|
|
3570
|
+
duplicate_content: {
|
|
3571
|
+
name: "Fix Duplicate Content Blocks",
|
|
3572
|
+
effort: "Medium",
|
|
3573
|
+
description: "Sections within pages contain identical or near-identical text. LLMs may flag this as low-quality or thin content, reducing citation authority. Rewrite duplicate blocks with unique angles."
|
|
3574
|
+
},
|
|
3575
|
+
cross_page_duplication: {
|
|
3576
|
+
name: "Eliminate Cross-Page Duplicate Content",
|
|
3577
|
+
effort: "Medium",
|
|
3578
|
+
description: "The same paragraphs appear on multiple pages. AI engines may only index one version and ignore the rest. Rewrite shared content so each page offers a unique perspective."
|
|
3579
|
+
},
|
|
3298
3580
|
content_cannibalization: {
|
|
3299
3581
|
name: "Resolve Content Cannibalization",
|
|
3300
3582
|
effort: "Medium",
|
|
@@ -3703,9 +3985,9 @@ var PAGE_CRITERIA = {
|
|
|
3703
3985
|
original_data: { weight: 0.1, label: "Original Data & Expert Content" },
|
|
3704
3986
|
fact_density: { weight: 0.06, label: "Fact & Data Density" },
|
|
3705
3987
|
direct_answer_density: { weight: 0.05, label: "Direct Answer Paragraphs" },
|
|
3706
|
-
qa_content_format: { weight: 0.
|
|
3707
|
-
query_answer_alignment: { weight: 0.
|
|
3708
|
-
faq_section: { weight: 0.
|
|
3988
|
+
qa_content_format: { weight: 0.04, label: "Q&A Content Format" },
|
|
3989
|
+
query_answer_alignment: { weight: 0.04, label: "Query-Answer Alignment" },
|
|
3990
|
+
faq_section: { weight: 0.03, label: "FAQ Section Content" },
|
|
3709
3991
|
// Content Organization
|
|
3710
3992
|
content_freshness: { weight: 0.04, label: "Content Freshness Signals" },
|
|
3711
3993
|
schema_markup: { weight: 0.03, label: "Schema.org Structured Data" },
|
|
@@ -3722,7 +4004,8 @@ var PAGE_CRITERIA = {
|
|
|
3722
4004
|
evidence_packaging: { weight: 0.03, label: "Evidence Packaging" },
|
|
3723
4005
|
entity_disambiguation: { weight: 0.02, label: "Entity Disambiguation" },
|
|
3724
4006
|
extraction_friction: { weight: 0.02, label: "Extraction Friction Score" },
|
|
3725
|
-
image_context_ai: { weight: 0.01, label: "Image Context for AI" }
|
|
4007
|
+
image_context_ai: { weight: 0.01, label: "Image Context for AI" },
|
|
4008
|
+
duplicate_content: { weight: 0.05, label: "Duplicate Content Blocks" }
|
|
3726
4009
|
};
|
|
3727
4010
|
function extractJsonLdBlocks(html) {
|
|
3728
4011
|
const blocks = [];
|
|
@@ -4171,6 +4454,48 @@ function scoreImageContextAI(html) {
|
|
|
4171
4454
|
if (contextualImages.length > 0) score += 3;
|
|
4172
4455
|
return cap(score, 10);
|
|
4173
4456
|
}
|
|
4457
|
+
/**
 * Page-level duplicate-content score.
 *
 * @param {string} html - Full page HTML.
 * @returns {number} The `score` field of scoreDuplicateContentDetailed(html).
 */
function scoreDuplicateContent(html) {
  const { score } = scoreDuplicateContentDetailed(html);
  return score;
}
|
|
4460
|
+
/**
 * Score one page for duplicated paragraphs between its sections and list them.
 *
 * For every unordered pair of sections, each paragraph of the earlier section
 * is counted once if some paragraph of the later section has shingle
 * similarity above 0.4. Scores: no duplicates → 10; one duplicate → 6 when it
 * is at most 5% of all paragraphs, otherwise 4; two → 2; more → 0.
 *
 * @param {string} html - Full page HTML.
 * @returns {{score: number, duplicates: Array<{headingA: string, headingB: string, similarity: number, sample: string}>}}
 *   The score and one entry per duplicated paragraph.
 */
function scoreDuplicateContentDetailed(html) {
  const sections = extractDuplicateContentSections(html);
  if (sections.length < 2) {
    return { score: 10, duplicates: [] };
  }

  let totalParagraphs = 0;
  for (const section of sections) {
    totalParagraphs += section.paragraphs.length;
  }

  const duplicates = [];
  sections.forEach((left, index) => {
    for (const right of sections.slice(index + 1)) {
      for (const pA of left.paragraphs) {
        for (const pB of right.paragraphs) {
          const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
          if (!(sim > 0.4)) continue;
          duplicates.push({
            headingA: left.heading,
            headingB: right.heading,
            similarity: Math.round(sim * 100),
            sample: pA.text.slice(0, 80)
          });
          // One match is enough for this paragraph; move to the next one.
          break;
        }
      }
    }
  });

  const dupParagraphCount = duplicates.length;
  if (dupParagraphCount === 0) {
    return { score: 10, duplicates: [] };
  }

  const dupRatio = totalParagraphs > 0 ? dupParagraphCount / totalParagraphs : 0;
  let score = 0;
  if (dupParagraphCount === 1) {
    score = dupRatio <= 0.05 ? 6 : 4;
  } else if (dupParagraphCount === 2) {
    score = 2;
  }
  return { score, duplicates };
}
|
|
4174
4499
|
var SCORING_FUNCTIONS = {
|
|
4175
4500
|
schema_markup: scoreSchemaMarkup,
|
|
4176
4501
|
qa_content_format: scoreQAFormat,
|
|
@@ -4191,7 +4516,8 @@ var SCORING_FUNCTIONS = {
|
|
|
4191
4516
|
evidence_packaging: scoreEvidencePackaging,
|
|
4192
4517
|
entity_disambiguation: scoreEntityDisambiguation,
|
|
4193
4518
|
extraction_friction: scoreExtractionFriction,
|
|
4194
|
-
image_context_ai: scoreImageContextAI
|
|
4519
|
+
image_context_ai: scoreImageContextAI,
|
|
4520
|
+
duplicate_content: scoreDuplicateContent
|
|
4195
4521
|
};
|
|
4196
4522
|
function scorePage(html, url) {
|
|
4197
4523
|
let totalWeight = 0;
|
|
@@ -4205,6 +4531,11 @@ function scorePage(html, url) {
|
|
|
4205
4531
|
totalWeight += weight;
|
|
4206
4532
|
}
|
|
4207
4533
|
let aeoScore = totalWeight === 0 ? 0 : Math.round(weightedSum / totalWeight);
|
|
4534
|
+
const dupScore = criterionScores.find((c) => c.criterion === "duplicate_content")?.score ?? 10;
|
|
4535
|
+
if (dupScore <= 6) {
|
|
4536
|
+
const dupCap = 35 + dupScore * 5;
|
|
4537
|
+
aeoScore = Math.min(aeoScore, dupCap);
|
|
4538
|
+
}
|
|
4208
4539
|
const scoreCapped = aeoScore > 75;
|
|
4209
4540
|
if (scoreCapped) aeoScore = 75;
|
|
4210
4541
|
return { aeoScore, criterionScores, scoreCapped };
|
|
@@ -4410,6 +4741,15 @@ function checkHasCitationReadyContent(html) {
|
|
|
4410
4741
|
}
|
|
4411
4742
|
return null;
|
|
4412
4743
|
}
|
|
4744
|
+
/**
 * Per-page issue check for duplicated content blocks.
 *
 * @param {string} html - Full page HTML.
 * @returns {{check: string, label: string, severity: string} | null} An issue
 *   describing the first duplicate when the page's duplicate-content score is
 *   6 or lower and duplicates were found; otherwise null. Severity is "error"
 *   for scores of 3 or lower, else "warning".
 */
function checkDuplicateContentBlocks(html) {
  const { score, duplicates } = scoreDuplicateContentDetailed(html);
  if (!(score <= 6) || !(duplicates.length > 0)) {
    return null;
  }

  const [first] = duplicates;
  let label;
  if (duplicates.length === 1) {
    label = `Duplicate content: '${first.headingA}' and '${first.headingB}' share ${first.similarity}% similar text ("${first.sample}...")`;
  } else {
    label = `${duplicates.length} duplicate blocks found (e.g. '${first.headingA}' and '${first.headingB}' \u2014 "${first.sample}...")`;
  }
  const severity = score <= 3 ? "error" : "warning";
  return { check: "duplicate-content", label, severity };
}
|
|
4413
4753
|
function analyzePage(html, url, category) {
|
|
4414
4754
|
const title = extractTitle(html);
|
|
4415
4755
|
const textContent = getTextContent2(html);
|
|
@@ -4428,7 +4768,8 @@ function analyzePage(html, url, category) {
|
|
|
4428
4768
|
checkImagesMissingAlt(html),
|
|
4429
4769
|
checkNoInternalLinks(html, url),
|
|
4430
4770
|
checkNoAnswerBlock(html),
|
|
4431
|
-
checkNoEvidence(html, url)
|
|
4771
|
+
checkNoEvidence(html, url),
|
|
4772
|
+
checkDuplicateContentBlocks(html)
|
|
4432
4773
|
];
|
|
4433
4774
|
for (const result of issueChecks) {
|
|
4434
4775
|
if (result) issues.push(result);
|
|
@@ -5060,6 +5401,10 @@ function printSummary(result) {
|
|
|
5060
5401
|
const issueLabel = issueCount === 0 ? "0 issues" : issueCount === 1 ? "1 issue" : `${issueCount} issues`;
|
|
5061
5402
|
const aeoLabel = page.aeoScore != null ? ` [AEO: ${page.aeoScore}]` : "";
|
|
5062
5403
|
log(` ${cat.padEnd(10)} ${page.url.padEnd(50)} ${issueLabel}${aeoLabel}`);
|
|
5404
|
+
const dupIssue = page.issues.find((i) => i.check === "duplicate-content");
|
|
5405
|
+
if (dupIssue) {
|
|
5406
|
+
log(` \u26A0 ${dupIssue.label}`);
|
|
5407
|
+
}
|
|
5063
5408
|
}
|
|
5064
5409
|
const scored = result.pagesReviewed.filter((p) => p.aeoScore != null);
|
|
5065
5410
|
if (scored.length > 0) {
|