aeorank 3.0.3 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -462,6 +462,75 @@ function detectParkedDomain(bodySnippet) {
462
462
  return { isParked: false };
463
463
  }
464
464
 
465
+ // src/duplicate-content.ts
466
+ var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i; // call-to-action and legal phrases; isBoilerplateParagraph tests paragraphs under 20 words against it
467
+ var MIN_SUBSTANTIVE_WORDS = 15; // createParagraph discards paragraphs with fewer tokens than this
468
+ var MAX_METADATA_WORDS = 24; // isMetadataParagraph: largest word count still treated as a "Label: value" paragraph
469
+ var MAX_METADATA_LABEL_WORDS = 4; // isMetadataParagraph: largest word count allowed in the label before the colon
470
/**
 * Reduce an HTML fragment to lowercase plain text suitable for comparison.
 *
 * Tags and character references are each replaced by a single space,
 * whitespace runs are collapsed, and the result is trimmed and lowercased.
 *
 * @param {string} htmlFragment - Raw HTML, e.g. one `<p>...</p>` element.
 * @returns {string} The normalized text.
 */
function normalizeParagraphText(htmlFragment) {
  return htmlFragment
    .replace(/<[^>]*>/g, " ")
    // Covers named (&nbsp;), decimal (&#8217;) and hex (&#x2019;) references.
    // A plain /&\w+;/ misses the numeric forms because "#" is not a word
    // character, which left the raw reference inside the normalized text.
    .replace(/&(?:#\d+|#x[0-9a-f]+|\w+);/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
}
473
/**
 * Split text on whitespace into word tokens, trimming punctuation from both
 * ends of every token and dropping tokens that end up empty.
 *
 * Letters and digits are recognised through Unicode property classes so that
 * accented and non-Latin words keep their edge characters; an ASCII-only class
 * would turn "café" into "caf" and erase non-Latin words altogether.
 * Punctuation inside a token (e.g. the apostrophe in "don't") is kept.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in their original order.
 */
function tokenize(text) {
  const edgePunctuation = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu;
  return text
    .split(/\s+/)
    .map((word) => word.replace(edgePunctuation, ""))
    .filter((word) => word.length > 0);
}
476
/**
 * Decide whether a paragraph is short call-to-action or consent boilerplate.
 *
 * @param {string} text - Paragraph text to test.
 * @param {number} words - Word count of the paragraph.
 * @returns {boolean} True when the paragraph has fewer than 20 words and
 *   matches BOILERPLATE_PATTERNS, or mentions cookie/GDPR/consent/opt-out
 *   wording and has fewer than 30 words.
 */
function isBoilerplateParagraph(text, words) {
  const isShortCallToAction = words < 20 && BOILERPLATE_PATTERNS.test(text);
  if (isShortCallToAction) return true;
  const mentionsConsent = /\b(cookie|gdpr|consent|opt.out)\b/i.test(text);
  return mentionsConsent && words < 30;
}
481
/**
 * Decide whether a paragraph looks like a short "Label: value" metadata line.
 *
 * @param {string} text - Paragraph text to test.
 * @param {number} words - Word count of the paragraph.
 * @returns {boolean} True when the text opens with a label of 1-60 non-colon
 *   characters followed by a colon and whitespace, the label tokenizes to
 *   between 1 and MAX_METADATA_LABEL_WORDS words, and the paragraph has at
 *   most MAX_METADATA_WORDS words.
 */
function isMetadataParagraph(text, words) {
  const label = /^([^:]{1,60}):\s+/.exec(text);
  if (label === null) return false;
  const labelWordCount = tokenize(label[1]).length;
  if (labelWordCount === 0 || labelWordCount > MAX_METADATA_LABEL_WORDS) return false;
  return words <= MAX_METADATA_WORDS;
}
487
/**
 * Build the set of word n-grams ("shingles") for a token list.
 *
 * @param {string[]} words - Tokens in document order.
 * @param {number} [n=4] - Number of consecutive tokens per shingle.
 * @returns {Set<string>} Distinct shingles, each the n tokens joined by a
 *   single space; empty when there are fewer than n tokens.
 */
function buildShinglesFromTokens(words, n = 4) {
  const lastStart = words.length - n;
  const shingles = new Set();
  let start = 0;
  while (start <= lastStart) {
    shingles.add(words.slice(start, start + n).join(" "));
    start += 1;
  }
  return shingles;
}
494
/**
 * Turn one paragraph's HTML into a comparable record, or reject it.
 *
 * A paragraph is rejected (null) when it has fewer than MIN_SUBSTANTIVE_WORDS
 * tokens, is classified as boilerplate or metadata, or yields fewer than three
 * distinct shingles.
 *
 * @param {string} htmlFragment - HTML of a single paragraph.
 * @returns {{text: string, shingles: Set<string>}|null} Normalized text with
 *   its shingle set, or null when the paragraph is not worth comparing.
 */
function createParagraph(htmlFragment) {
  const text = normalizeParagraphText(htmlFragment);
  const tokens = tokenize(text);
  const wordCount = tokens.length;
  const rejected =
    wordCount < MIN_SUBSTANTIVE_WORDS ||
    isBoilerplateParagraph(text, wordCount) ||
    isMetadataParagraph(text, wordCount);
  if (rejected) return null;
  const shingles = buildShinglesFromTokens(tokens);
  return shingles.size < 3 ? null : { text, shingles };
}
504
/**
 * Remove elements that do not carry page body content.
 *
 * First drops script, style, nav, header, footer and noscript elements
 * (matched open tag to the nearest same-named close tag), then drops aside
 * elements from what remains.
 *
 * @param {string} html - Page HTML.
 * @returns {string} HTML with those elements removed.
 */
function stripNonContentHtml(html) {
  const chromePattern = /<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi;
  const asidePattern = /<aside\b[^>]*>[\s\S]*?<\/aside>/gi;
  const withoutChrome = html.replace(chromePattern, "");
  return withoutChrome.replace(asidePattern, "");
}
507
/**
 * Collect every comparable paragraph on a page.
 *
 * Non-content elements are stripped first; each remaining `<p>` element is
 * passed to createParagraph and kept only when that returns a record.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} Paragraph records
 *   in document order.
 */
function extractDuplicateContentParagraphs(html) {
  const fragments = stripNonContentHtml(html).match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) ?? [];
  const paragraphs = [];
  for (const fragment of fragments) {
    const paragraph = createParagraph(fragment);
    if (paragraph !== null) paragraphs.push(paragraph);
  }
  return paragraphs;
}
512
/**
 * Split a page into heading-delimited sections of comparable paragraphs.
 *
 * The stripped HTML is cut immediately before every `<h2>`/`<h3>` open tag.
 * Each chunk is labelled with its heading text (tags removed, trimmed) or
 * "(intro)" when it contains no such heading. Chunks without any comparable
 * paragraph are omitted.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in document order.
 */
function extractDuplicateContentSections(html) {
  const chunks = stripNonContentHtml(html).split(/(?=<h[23]\b[^>]*>)/i);
  return chunks.reduce((sections, chunk) => {
    const headingMatch = /<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i.exec(chunk);
    const heading = headingMatch === null
      ? "(intro)"
      : headingMatch[1].replace(/<[^>]*>/g, "").trim();
    const paragraphs = [];
    for (const fragment of chunk.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) ?? []) {
      const paragraph = createParagraph(fragment);
      if (paragraph !== null) paragraphs.push(paragraph);
    }
    if (paragraphs.length > 0) sections.push({ heading, paragraphs });
    return sections;
  }, []);
}
524
/**
 * Jaccard similarity of two shingle sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} A value from 0 to 1; 0 when both sets are empty.
 */
function shingleJaccardSimilarity(a, b) {
  const combinedSize = a.size + b.size;
  if (combinedSize === 0) return 0;
  let shared = 0;
  a.forEach((shingle) => {
    if (b.has(shingle)) shared += 1;
  });
  // |A ∪ B| = |A| + |B| - |A ∩ B|, which is positive whenever either set is non-empty.
  return shared / (combinedSize - shared);
}
533
+
465
534
  // src/site-crawler.ts
466
535
  async function fetchText(url) {
467
536
  try {
@@ -2942,6 +3011,186 @@ function checkImageContextAI(data) {
2942
3011
  }
2943
3012
  return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
2944
3013
  }
3014
/**
 * Find pairs of heading sections on one page that share near-duplicate text.
 *
 * Every unordered pair of sections is checked; a pair is reported once, for
 * the first paragraph combination whose shingle similarity exceeds 0.4.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{headingA: string, headingB: string, similarity: number, sample: string}>}
 *   One record per duplicated section pair; `similarity` is a rounded
 *   percentage and `sample` is the first 80 characters of the matching
 *   paragraph from the earlier section.
 */
function findIntraPageDuplicates(html) {
  const sections = extractDuplicateContentSections(html);
  if (sections.length < 2) return [];
  const pairs = [];
  sections.forEach((sectionA, indexA) => {
    sections.slice(indexA + 1).forEach((sectionB) => {
      // some() stops at the first hit, so each section pair is reported at most once.
      sectionA.paragraphs.some((pA) =>
        sectionB.paragraphs.some((pB) => {
          const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
          if (!(sim > 0.4)) return false;
          pairs.push({
            headingA: sectionA.heading,
            headingB: sectionB.heading,
            similarity: Math.round(sim * 100),
            sample: pA.text.slice(0, 80)
          });
          return true;
        })
      );
    });
  });
  return pairs;
}
3041
/**
 * Site-level criterion "duplicate_content": near-duplicate text between
 * heading sections of the same page.
 *
 * Analyses the homepage and every page in `data.blogSample`. The score is
 * driven by the share of analysed pages that contain at least one duplicated
 * section pair; up to two example pairs from each of the first three affected
 * pages are appended as findings.
 *
 * @param {object} data - Crawl data; reads `homepage` ({text, finalUrl}),
 *   `blogSample` (array of {text, finalUrl}) and `domain`.
 * @returns {{criterion: string, criterion_label: string, score: number, status: string, findings: object[], fix_priority: string}}
 *   Criterion result with a score from 0 to 10.
 */
function checkDuplicateContent(data) {
  const findings = [];

  const pages = [];
  if (data.homepage) {
    const { text, finalUrl } = data.homepage;
    pages.push({ html: text, url: finalUrl || `https://${data.domain}/` });
  }
  for (const { text, finalUrl } of data.blogSample || []) {
    pages.push({ html: text, url: finalUrl || "" });
  }
  if (pages.length === 0) {
    findings.push({ severity: "critical", detail: "No pages available for duplicate content analysis" });
    return { criterion: "duplicate_content", criterion_label: "Duplicate Content Blocks", score: 0, status: "fail", findings, fix_priority: "P1" };
  }

  const dupDetails = [];
  let totalDupPairs = 0;
  pages.forEach(({ html, url }) => {
    const pairs = findIntraPageDuplicates(html);
    if (pairs.length === 0) return;
    totalDupPairs += pairs.length;
    dupDetails.push({ url, pairs });
  });
  const totalDupPages = dupDetails.length;
  const dupRatio = totalDupPages / pages.length;
  const dupPercent = Math.round(dupRatio * 100);

  // Tiers are checked from best to worst; the first one that applies wins.
  let score;
  let summary;
  if (totalDupPairs === 0) {
    score = 10;
    summary = { severity: "info", detail: `${pages.length} pages analyzed - no duplicate content blocks detected` };
  } else if (dupRatio <= 0.05 && totalDupPairs <= 2) {
    score = 9;
    summary = { severity: "info", detail: `${totalDupPairs} duplicate block pair(s) on ${totalDupPages} page(s) - minor` };
  } else if (dupRatio <= 0.1) {
    score = 7;
    summary = { severity: "low", detail: `${totalDupPairs} duplicate block pair(s) across ${totalDupPages} page(s)`, fix: "Rewrite duplicate sections to provide unique content in each" };
  } else if (dupRatio <= 0.2) {
    score = 5;
    summary = { severity: "medium", detail: `${totalDupPages} pages (${dupPercent}%) contain duplicate content blocks`, fix: "Rewrite or remove repeated text blocks - LLMs may flag this as low-quality content" };
  } else if (dupRatio <= 0.4) {
    score = 3;
    summary = { severity: "medium", detail: `${totalDupPages} pages (${dupPercent}%) have significant duplicate content`, fix: "Widespread duplicate blocks reduce content authority - rewrite each section with unique angles" };
  } else {
    score = 0;
    summary = { severity: "high", detail: `${totalDupPages} pages (${dupPercent}%) contain duplicate content blocks`, fix: "Severe content duplication across the site - LLMs will likely reduce citation authority" };
  }
  findings.push(summary);

  for (const { url, pairs } of dupDetails.slice(0, 3)) {
    const shortUrl = url.slice(0, 60);
    for (const { headingA, headingB, similarity, sample } of pairs.slice(0, 2)) {
      findings.push({
        severity: "low",
        detail: `${shortUrl}: '${headingA}' and '${headingB}' share ${similarity}% similar text ("${sample}...")`,
        fix: `Rewrite one of these sections to eliminate duplicate content`
      });
    }
  }

  const passed = score >= 7;
  let status = "fail";
  if (passed) status = "pass";
  else if (score >= 4) status = "partial";
  return { criterion: "duplicate_content", criterion_label: "Duplicate Content Blocks", score, status, findings, fix_priority: passed ? "P3" : "P1" };
}
3100
/**
 * Site-level criterion "cross_page_duplication": paragraphs repeated on
 * different pages.
 *
 * Compares the homepage and every page in `data.blogSample` pairwise. A page
 * pair is flagged when at least two paragraphs of the first page each have a
 * counterpart on the second with shingle similarity above 0.4. Paragraphs of
 * the first page whose fingerprint occurs on many pages are treated as
 * site-wide boilerplate and skipped. The score is driven by the share of pages
 * that take part in at least one flagged pair.
 *
 * @param {object} data - Crawl data; reads `homepage` ({text, finalUrl}),
 *   `blogSample` (array of {text, finalUrl}) and `domain`.
 * @returns {{criterion: string, criterion_label: string, score: number, status: string, findings: object[], fix_priority: string}}
 *   Criterion result with a score from 0 to 10 (5 when fewer than two pages
 *   are available).
 */
function checkCrossPageDuplication(data) {
  const findings = [];
  // A paragraph's fingerprint is its first five shingles joined by "|".
  const fingerprintOf = (paragraph) => [...paragraph.shingles].slice(0, 5).join("|");
  const toPage = (url, html) => {
    const paragraphs = extractDuplicateContentParagraphs(html);
    // Computed once here instead of once per page-pair comparison.
    return { url, paragraphs, fingerprints: paragraphs.map(fingerprintOf) };
  };

  const pages = [];
  if (data.homepage) {
    pages.push(toPage(data.homepage.finalUrl || `https://${data.domain}/`, data.homepage.text));
  }
  if (data.blogSample) {
    for (const page of data.blogSample) {
      pages.push(toPage(page.finalUrl || "", page.text));
    }
  }
  if (pages.length <= 1) {
    findings.push({ severity: "info", detail: "Not enough pages to assess cross-page duplication" });
    return { criterion: "cross_page_duplication", criterion_label: "Cross-Page Duplicate Content", score: 5, status: "partial", findings, fix_priority: "P3" };
  }

  // Count, per fingerprint, the number of distinct pages it appears on.
  const paragraphPageCount = new Map();
  for (const page of pages) {
    for (const fp of new Set(page.fingerprints)) {
      paragraphPageCount.set(fp, (paragraphPageCount.get(fp) || 0) + 1);
    }
  }
  const boilerplateThreshold = Math.max(3, pages.length * 0.4);
  const siteBoilerprints = new Set();
  for (const [fp, count] of paragraphPageCount) {
    if (count >= boilerplateThreshold) siteBoilerprints.add(fp);
  }

  const crossDupPairs = [];
  // Affected pages are tracked by index. Keying on the 60-character display
  // URL merged distinct pages that share a long URL prefix (or that all lack a
  // finalUrl), which understated the affected ratio and inflated the score.
  const affectedPageIndexes = new Set();
  for (let i = 0; i < pages.length; i++) {
    for (let j = i + 1; j < pages.length; j++) {
      let dupCount = 0;
      let sample = "";
      pages[i].paragraphs.forEach((pA, index) => {
        if (siteBoilerprints.has(pages[i].fingerprints[index])) return;
        const hasCounterpart = pages[j].paragraphs.some(
          (pB) => shingleJaccardSimilarity(pA.shingles, pB.shingles) > 0.4
        );
        if (!hasCounterpart) return;
        dupCount++;
        if (!sample) sample = pA.text.slice(0, 80);
      });
      if (dupCount >= 2) {
        crossDupPairs.push({
          urlA: pages[i].url.slice(0, 60),
          urlB: pages[j].url.slice(0, 60),
          dupCount,
          sample
        });
        affectedPageIndexes.add(i);
        affectedPageIndexes.add(j);
      }
    }
  }

  const affectedCount = affectedPageIndexes.size;
  const affectedRatio = affectedCount / pages.length;
  const affectedPercent = Math.round(affectedRatio * 100);
  const totalDupParagraphs = crossDupPairs.reduce((sum, pair) => sum + pair.dupCount, 0);

  let score;
  if (crossDupPairs.length === 0) {
    score = 10;
    findings.push({ severity: "info", detail: `${pages.length} pages analyzed - no cross-page content duplication detected` });
  } else if (affectedRatio <= 0.05 && totalDupParagraphs <= 4) {
    score = 9;
    findings.push({ severity: "info", detail: `${totalDupParagraphs} shared paragraph(s) across ${affectedCount} page(s) - minor` });
  } else if (affectedRatio <= 0.1) {
    score = 7;
    findings.push({ severity: "low", detail: `${totalDupParagraphs} shared paragraphs across ${affectedCount} pages`, fix: "Rewrite shared content so each page provides a unique perspective" });
  } else if (affectedRatio <= 0.2) {
    score = 5;
    findings.push({ severity: "medium", detail: `${affectedCount} pages (${affectedPercent}%) share duplicate paragraphs`, fix: "Significant cross-page duplication - AI engines may only index one version" });
  } else if (affectedRatio <= 0.4) {
    score = 3;
    findings.push({ severity: "medium", detail: `${affectedCount} pages (${affectedPercent}%) contain shared content blocks`, fix: "Widespread copy-paste content across pages reduces overall site authority" });
  } else {
    score = 0;
    findings.push({ severity: "high", detail: `${affectedCount} pages (${affectedPercent}%) share duplicate content`, fix: "Severe cross-page duplication - AI engines will likely ignore redundant pages entirely" });
  }

  for (const pair of crossDupPairs.slice(0, 3)) {
    findings.push({
      severity: "low",
      detail: `${pair.dupCount} shared paragraph(s): ${pair.urlA} \u2194 ${pair.urlB} ("${pair.sample}...")`,
      fix: "Rewrite shared paragraphs so each page has unique content"
    });
  }
  return { criterion: "cross_page_duplication", criterion_label: "Cross-Page Duplicate Content", score, status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P1" };
}
2945
3194
  function auditSiteFromData(data) {
2946
3195
  const topicCoherence = checkTopicCoherence(data);
2947
3196
  const cannibalization = checkContentCannibalization(data, topicCoherence.score);
@@ -2980,7 +3229,10 @@ function auditSiteFromData(data) {
2980
3229
  checkEvidencePackaging(data),
2981
3230
  checkEntityDisambiguation(data),
2982
3231
  checkExtractionFriction(data),
2983
- checkImageContextAI(data)
3232
+ checkImageContextAI(data),
3233
+ // V3 criteria (#35-#36)
3234
+ checkDuplicateContent(data),
3235
+ checkCrossPageDuplication(data)
2984
3236
  ];
2985
3237
  }
2986
3238
  async function auditSite(targetUrl) {
@@ -3004,11 +3256,11 @@ var WEIGHTS = {
3004
3256
  // Information density per page
3005
3257
  direct_answer_density: 0.05,
3006
3258
  // Direct answers to queries
3007
- qa_content_format: 0.05,
3259
+ qa_content_format: 0.04,
3008
3260
  // Answer-shaped content structure
3009
- query_answer_alignment: 0.05,
3261
+ query_answer_alignment: 0.04,
3010
3262
  // Relevance to actual AI queries
3011
- faq_section: 0.04,
3263
+ faq_section: 0.03,
3012
3264
  // Structured Q&A pairs
3013
3265
  // ─── Content Organization (~30%) ──────────────────────────────────────────
3014
3266
  // HOW easily AI engines can extract and trust your content.
@@ -3056,8 +3308,13 @@ var WEIGHTS = {
3056
3308
  // Clear entity boundaries
3057
3309
  extraction_friction: 0.02,
3058
3310
  // Sentence length, voice, jargon
3059
- image_context_ai: 0.01
3311
+ image_context_ai: 0.01,
3060
3312
  // Figure/figcaption, alt text quality
3313
+ // ─── V3 Criteria ────────────────────────────────────────────────────────
3314
+ duplicate_content: 0.05,
3315
+ // Duplicate text blocks within pages
3316
+ cross_page_duplication: 0.03
3317
+ // Same paragraphs copied across pages
3061
3318
  };
3062
3319
  function calculateOverallScore(criteria) {
3063
3320
  let totalWeight = 0;
@@ -3187,7 +3444,9 @@ var PILLARS = {
3187
3444
  "fact_density",
3188
3445
  "citation_ready_writing",
3189
3446
  "answer_first_placement",
3190
- "evidence_packaging"
3447
+ "evidence_packaging",
3448
+ "duplicate_content",
3449
+ "cross_page_duplication"
3191
3450
  ],
3192
3451
  "Content Structure": [
3193
3452
  "direct_answer_density",
@@ -3252,6 +3511,8 @@ var CLIENT_NAMES = {
3252
3511
  image_context_ai: "Image Context for AI",
3253
3512
  schema_coverage: "Schema Coverage",
3254
3513
  speakable_schema: "Speakable Schema",
3514
+ duplicate_content: "Duplicate Content Blocks",
3515
+ cross_page_duplication: "Cross-Page Duplicate Content",
3255
3516
  content_cannibalization: "Content Cannibalization",
3256
3517
  llms_txt: "llms.txt File",
3257
3518
  robots_txt: "robots.txt for AI",
@@ -3269,10 +3530,12 @@ var PILLAR_WEIGHTS = {
3269
3530
  citation_ready_writing: 0.04,
3270
3531
  answer_first_placement: 0.03,
3271
3532
  evidence_packaging: 0.03,
3533
+ duplicate_content: 0.05,
3534
+ cross_page_duplication: 0.03,
3272
3535
  direct_answer_density: 0.05,
3273
- qa_content_format: 0.05,
3274
- query_answer_alignment: 0.05,
3275
- faq_section: 0.04,
3536
+ qa_content_format: 0.04,
3537
+ query_answer_alignment: 0.04,
3538
+ faq_section: 0.03,
3276
3539
  table_list_extractability: 0.03,
3277
3540
  definition_patterns: 0.02,
3278
3541
  entity_disambiguation: 0.02,
@@ -3305,6 +3568,8 @@ var CRITERION_EFFORT = {
3305
3568
  citation_ready_writing: "Medium",
3306
3569
  answer_first_placement: "Medium",
3307
3570
  evidence_packaging: "Medium",
3571
+ duplicate_content: "Medium",
3572
+ cross_page_duplication: "Medium",
3308
3573
  direct_answer_density: "Medium",
3309
3574
  qa_content_format: "Medium",
3310
3575
  query_answer_alignment: "Medium",
@@ -3360,6 +3625,8 @@ var FIX_DESCRIPTIONS = {
3360
3625
  image_context_ai: "Wrap images in <figure>/<figcaption> with descriptive alt text.",
3361
3626
  schema_coverage: "Extend structured data to inner pages (articles, services, products).",
3362
3627
  speakable_schema: "Add SpeakableSpecification schema for voice assistant compatibility.",
3628
+ duplicate_content: "Rewrite duplicate text blocks so each section provides unique value.",
3629
+ cross_page_duplication: "Rewrite shared paragraphs across pages so each page has unique content.",
3363
3630
  content_cannibalization: "Consolidate overlapping pages or differentiate titles and H1 headings.",
3364
3631
  llms_txt: "Create a /llms.txt file describing your site for AI engines.",
3365
3632
  robots_txt: "Update robots.txt to explicitly allow AI crawlers.",
@@ -3455,7 +3722,9 @@ var CRITERION_LABELS = {
3455
3722
  "Evidence Packaging": "Evidence Packaging",
3456
3723
  "Entity Disambiguation": "Entity Disambiguation",
3457
3724
  "Extraction Friction Score": "Extraction Friction Score",
3458
- "Image Context for AI": "Image Context for AI"
3725
+ "Image Context for AI": "Image Context for AI",
3726
+ "Duplicate Content Blocks": "Duplicate Content Blocks",
3727
+ "Cross-Page Duplicate Content": "Cross-Page Duplicate Content"
3459
3728
  };
3460
3729
  function scoreToStatus(score) {
3461
3730
  if (score === 0) return "MISSING";
@@ -3550,9 +3819,9 @@ var CRITERION_WEIGHTS = {
3550
3819
  content_depth: 0.07,
3551
3820
  fact_density: 0.06,
3552
3821
  direct_answer_density: 0.05,
3553
- qa_content_format: 0.05,
3554
- query_answer_alignment: 0.05,
3555
- faq_section: 0.04,
3822
+ qa_content_format: 0.04,
3823
+ query_answer_alignment: 0.04,
3824
+ faq_section: 0.03,
3556
3825
  // Content Organization (~30%)
3557
3826
  entity_consistency: 0.05,
3558
3827
  internal_linking: 0.04,
@@ -3581,7 +3850,10 @@ var CRITERION_WEIGHTS = {
3581
3850
  evidence_packaging: 0.03,
3582
3851
  entity_disambiguation: 0.02,
3583
3852
  extraction_friction: 0.02,
3584
- image_context_ai: 0.01
3853
+ image_context_ai: 0.01,
3854
+ // V3 Criteria
3855
+ duplicate_content: 0.05,
3856
+ cross_page_duplication: 0.03
3585
3857
  };
3586
3858
  var OPPORTUNITY_TEMPLATES = {
3587
3859
  llms_txt: {
@@ -3704,6 +3976,16 @@ var OPPORTUNITY_TEMPLATES = {
3704
3976
  effort: "Medium",
3705
3977
  description: "Ensure every question-format heading (H2/H3) is followed by a direct answer paragraph. This pattern is ideal for AI engine snippet extraction."
3706
3978
  },
3979
+ duplicate_content: {
3980
+ name: "Fix Duplicate Content Blocks",
3981
+ effort: "Medium",
3982
+ description: "Sections within pages contain identical or near-identical text. LLMs may flag this as low-quality or thin content, reducing citation authority. Rewrite duplicate blocks with unique angles."
3983
+ },
3984
+ cross_page_duplication: {
3985
+ name: "Eliminate Cross-Page Duplicate Content",
3986
+ effort: "Medium",
3987
+ description: "The same paragraphs appear on multiple pages. AI engines may only index one version and ignore the rest. Rewrite shared content so each page offers a unique perspective."
3988
+ },
3707
3989
  content_cannibalization: {
3708
3990
  name: "Resolve Content Cannibalization",
3709
3991
  effort: "Medium",
@@ -4112,9 +4394,9 @@ var PAGE_CRITERIA = {
4112
4394
  original_data: { weight: 0.1, label: "Original Data & Expert Content" },
4113
4395
  fact_density: { weight: 0.06, label: "Fact & Data Density" },
4114
4396
  direct_answer_density: { weight: 0.05, label: "Direct Answer Paragraphs" },
4115
- qa_content_format: { weight: 0.05, label: "Q&A Content Format" },
4116
- query_answer_alignment: { weight: 0.05, label: "Query-Answer Alignment" },
4117
- faq_section: { weight: 0.04, label: "FAQ Section Content" },
4397
+ qa_content_format: { weight: 0.04, label: "Q&A Content Format" },
4398
+ query_answer_alignment: { weight: 0.04, label: "Query-Answer Alignment" },
4399
+ faq_section: { weight: 0.03, label: "FAQ Section Content" },
4118
4400
  // Content Organization
4119
4401
  content_freshness: { weight: 0.04, label: "Content Freshness Signals" },
4120
4402
  schema_markup: { weight: 0.03, label: "Schema.org Structured Data" },
@@ -4131,7 +4413,8 @@ var PAGE_CRITERIA = {
4131
4413
  evidence_packaging: { weight: 0.03, label: "Evidence Packaging" },
4132
4414
  entity_disambiguation: { weight: 0.02, label: "Entity Disambiguation" },
4133
4415
  extraction_friction: { weight: 0.02, label: "Extraction Friction Score" },
4134
- image_context_ai: { weight: 0.01, label: "Image Context for AI" }
4416
+ image_context_ai: { weight: 0.01, label: "Image Context for AI" },
4417
+ duplicate_content: { weight: 0.05, label: "Duplicate Content Blocks" }
4135
4418
  };
4136
4419
  function extractJsonLdBlocks(html) {
4137
4420
  const blocks = [];
@@ -4580,6 +4863,48 @@ function scoreImageContextAI(html) {
4580
4863
  if (contextualImages.length > 0) score += 3;
4581
4864
  return cap(score, 10);
4582
4865
  }
4866
/**
 * Page-level duplicate-content score without the supporting detail.
 *
 * @param {string} html - Page HTML.
 * @returns {number} The `score` field of scoreDuplicateContentDetailed(html).
 */
function scoreDuplicateContent(html) {
  const { score } = scoreDuplicateContentDetailed(html);
  return score;
}
4869
/**
 * Score a single page for near-duplicate paragraphs across its heading
 * sections and list the matches found.
 *
 * For every pair of sections, each paragraph of the earlier section counts as
 * a duplicate when some paragraph of the later section has shingle similarity
 * above 0.4 (only the first such counterpart is recorded).
 *
 * @param {string} html - Page HTML.
 * @returns {{score: number, duplicates: Array<{headingA: string, headingB: string, similarity: number, sample: string}>}}
 *   Score 10 with no duplicates; 6 or 4 for a single duplicate (6 only when it
 *   is at most 5% of all paragraphs); 2 for two; 0 for three or more.
 */
function scoreDuplicateContentDetailed(html) {
  const sections = extractDuplicateContentSections(html);
  if (sections.length < 2) return { score: 10, duplicates: [] };

  let totalParagraphs = 0;
  for (const section of sections) totalParagraphs += section.paragraphs.length;

  const duplicates = [];
  sections.forEach((sectionA, indexA) => {
    for (const sectionB of sections.slice(indexA + 1)) {
      for (const pA of sectionA.paragraphs) {
        for (const pB of sectionB.paragraphs) {
          const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
          if (sim > 0.4) {
            duplicates.push({
              headingA: sectionA.heading,
              headingB: sectionB.heading,
              similarity: Math.round(sim * 100),
              sample: pA.text.slice(0, 80)
            });
            break; // one record per paragraph of the earlier section
          }
        }
      }
    }
  });

  // Every counted duplicate produced exactly one record, so the list length is the count.
  const dupParagraphCount = duplicates.length;
  if (dupParagraphCount === 0) return { score: 10, duplicates: [] };
  const dupRatio = totalParagraphs > 0 ? dupParagraphCount / totalParagraphs : 0;

  let score = 0;
  if (dupParagraphCount === 1) {
    score = dupRatio <= 0.05 ? 6 : 4;
  } else if (dupParagraphCount === 2) {
    score = 2;
  }
  return { score, duplicates };
}
4583
4908
  var SCORING_FUNCTIONS = {
4584
4909
  schema_markup: scoreSchemaMarkup,
4585
4910
  qa_content_format: scoreQAFormat,
@@ -4600,7 +4925,8 @@ var SCORING_FUNCTIONS = {
4600
4925
  evidence_packaging: scoreEvidencePackaging,
4601
4926
  entity_disambiguation: scoreEntityDisambiguation,
4602
4927
  extraction_friction: scoreExtractionFriction,
4603
- image_context_ai: scoreImageContextAI
4928
+ image_context_ai: scoreImageContextAI,
4929
+ duplicate_content: scoreDuplicateContent
4604
4930
  };
4605
4931
  function scorePage(html, url) {
4606
4932
  let totalWeight = 0;
@@ -4614,6 +4940,11 @@ function scorePage(html, url) {
4614
4940
  totalWeight += weight;
4615
4941
  }
4616
4942
  let aeoScore = totalWeight === 0 ? 0 : Math.round(weightedSum / totalWeight);
4943
+ const dupScore = criterionScores.find((c) => c.criterion === "duplicate_content")?.score ?? 10;
4944
+ if (dupScore <= 6) {
4945
+ const dupCap = 35 + dupScore * 5;
4946
+ aeoScore = Math.min(aeoScore, dupCap);
4947
+ }
4617
4948
  const scoreCapped = aeoScore > 75;
4618
4949
  if (scoreCapped) aeoScore = 75;
4619
4950
  return { aeoScore, criterionScores, scoreCapped };
@@ -4833,6 +5164,15 @@ function checkHasCitationReadyContent(html) {
4833
5164
  }
4834
5165
  return null;
4835
5166
  }
5167
/**
 * Page issue check: report duplicated content blocks when the page's
 * duplicate-content score is 6 or lower.
 *
 * @param {string} html - Page HTML.
 * @returns {{check: string, label: string, severity: string}|null} An issue
 *   describing the first duplicate pair ("error" when the score is 3 or lower,
 *   otherwise "warning"), or null when nothing needs reporting.
 */
function checkDuplicateContentBlocks(html) {
  const { score, duplicates } = scoreDuplicateContentDetailed(html);
  const shouldReport = score <= 6 && duplicates.length > 0;
  if (!shouldReport) return null;

  const [first] = duplicates;
  let label;
  if (duplicates.length === 1) {
    label = `Duplicate content: '${first.headingA}' and '${first.headingB}' share ${first.similarity}% similar text ("${first.sample}...")`;
  } else {
    label = `${duplicates.length} duplicate blocks found (e.g. '${first.headingA}' and '${first.headingB}' \u2014 "${first.sample}...")`;
  }
  const severity = score <= 3 ? "error" : "warning";
  return { check: "duplicate-content", label, severity };
}
4836
5176
  function analyzePage(html, url, category) {
4837
5177
  const title = extractTitle(html);
4838
5178
  const textContent = getTextContent2(html);
@@ -4851,7 +5191,8 @@ function analyzePage(html, url, category) {
4851
5191
  checkImagesMissingAlt(html),
4852
5192
  checkNoInternalLinks(html, url),
4853
5193
  checkNoAnswerBlock(html),
4854
- checkNoEvidence(html, url)
5194
+ checkNoEvidence(html, url),
5195
+ checkDuplicateContentBlocks(html)
4855
5196
  ];
4856
5197
  for (const result of issueChecks) {
4857
5198
  if (result) issues.push(result);
@@ -5217,9 +5558,9 @@ var CRITERION_WEIGHTS2 = {
5217
5558
  content_depth: 0.07,
5218
5559
  fact_density: 0.06,
5219
5560
  direct_answer_density: 0.05,
5220
- qa_content_format: 0.05,
5221
- query_answer_alignment: 0.05,
5222
- faq_section: 0.04,
5561
+ qa_content_format: 0.04,
5562
+ query_answer_alignment: 0.04,
5563
+ faq_section: 0.03,
5223
5564
  // Content Organization (~30%)
5224
5565
  entity_consistency: 0.05,
5225
5566
  internal_linking: 0.04,
@@ -5233,6 +5574,8 @@ var CRITERION_WEIGHTS2 = {
5233
5574
  clean_html: 0.02,
5234
5575
  // Technical Plumbing (~15%)
5235
5576
  content_cannibalization: 0.02,
5577
+ duplicate_content: 0.05,
5578
+ cross_page_duplication: 0.03,
5236
5579
  llms_txt: 0.02,
5237
5580
  robots_txt: 0.02,
5238
5581
  content_velocity: 0.02,
@@ -5277,7 +5620,9 @@ var PHASE_CONFIG = [
5277
5620
  "citation_ready_writing",
5278
5621
  "answer_first_placement",
5279
5622
  "evidence_packaging",
5280
- "entity_disambiguation"
5623
+ "entity_disambiguation",
5624
+ "duplicate_content",
5625
+ "cross_page_duplication"
5281
5626
  ]
5282
5627
  },
5283
5628
  {
@@ -6157,6 +6502,66 @@ Summarization: yes`,
6157
6502
  }
6158
6503
  return fixes;
6159
6504
  },
6505
+ duplicate_content: (c, pages) => {
6506
+ if (c.score >= 10) return [];
6507
+ const impact = impactFromScore(c.score);
6508
+ const effort = effortForCriterion("duplicate_content", c.score);
6509
+ const affected = getAffectedPages("duplicate_content", pages);
6510
+ const sectionPairs = c.findings.filter((f) => f.detail.includes("' and '")).map((f) => {
6511
+ const match = f.detail.match(/'([^']+)' and '([^']+)'/);
6512
+ return match ? { a: match[1], b: match[2] } : null;
6513
+ }).filter(Boolean);
6514
+ const steps = [
6515
+ "Identify sections with duplicate or near-identical text",
6516
+ "Rewrite each section to provide a unique angle on the topic",
6517
+ "Ensure each heading section adds new information for the reader"
6518
+ ];
6519
+ if (sectionPairs.length > 0) {
6520
+ const pair = sectionPairs[0];
6521
+ steps.unshift(`Start with '${pair.a}' and '${pair.b}' which share similar text`);
6522
+ }
6523
+ return [{
6524
+ id: "fix-duplicate-content",
6525
+ criterion: c.criterion_label,
6526
+ criterionId: c.criterion,
6527
+ title: "Fix duplicate content blocks",
6528
+ description: "Sections within pages contain identical or near-identical text. LLMs may flag this as low-quality content, reducing the authority of the page.",
6529
+ impact,
6530
+ effort,
6531
+ impactScore: 0,
6532
+ category: "content",
6533
+ steps,
6534
+ successCriteria: "Each section within a page provides unique content",
6535
+ affectedPages: affected,
6536
+ pageCount: affected?.length
6537
+ }];
6538
+ },
6539
+ cross_page_duplication: (c, pages) => {
6540
+ if (c.score >= 10) return [];
6541
+ const impact = impactFromScore(c.score);
6542
+ const effort = effortForCriterion("cross_page_duplication", c.score);
6543
+ const affected = getAffectedPages("cross_page_duplication", pages);
6544
+ return [{
6545
+ id: "fix-cross-page-duplication",
6546
+ criterion: c.criterion_label,
6547
+ criterionId: c.criterion,
6548
+ title: "Eliminate cross-page duplicate content",
6549
+ description: "The same paragraphs appear on multiple pages. AI engines may only index one version, wasting the others.",
6550
+ impact,
6551
+ effort,
6552
+ impactScore: 0,
6553
+ category: "content",
6554
+ steps: [
6555
+ "Identify paragraphs that are copy-pasted across multiple pages",
6556
+ "Rewrite each instance to provide a unique angle relevant to that page",
6557
+ "Move truly shared content to a single resource page and link to it",
6558
+ "Use canonical tags if pages must share content"
6559
+ ],
6560
+ successCriteria: "Each page has unique body content with no copy-pasted paragraphs",
6561
+ affectedPages: affected,
6562
+ pageCount: affected?.length
6563
+ }];
6564
+ },
6160
6565
  visible_date_signal: (c, pages) => {
6161
6566
  if (c.score >= 10) return [];
6162
6567
  const impact = impactFromScore(c.score);