aeorank 3.0.3 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/browser.js CHANGED
@@ -78,6 +78,75 @@ function detectParkedDomain(bodySnippet) {
78
78
  return { isParked: false };
79
79
  }
80
80
 
81
+ // src/duplicate-content.ts
82
// Call-to-action and legal phrases; isBoilerplateParagraph tests short
// paragraphs against this pattern (case-insensitive, whole phrases only).
var BOILERPLATE_PATTERNS = new RegExp(
  `\\b(${[
    "sign up",
    "subscribe",
    "get started",
    "contact us",
    "request a demo",
    "free trial",
    "book a call",
    "schedule a",
    "learn more",
    "click here",
    "follow us",
    "share this",
    "copyright",
    "all rights reserved",
    "privacy policy",
    "terms of service"
  ].join("|")})\\b`,
  "i"
);
// Paragraphs with fewer tokens than this are discarded by createParagraph.
var MIN_SUBSTANTIVE_WORDS = 15;
// Upper bound on total words for a "Label: value" paragraph to count as metadata.
var MAX_METADATA_WORDS = 24;
// Upper bound on the number of words allowed in the label before the colon.
var MAX_METADATA_LABEL_WORDS = 4;
86
/**
 * Reduces an HTML fragment to lower-cased plain text suitable for comparison.
 *
 * Tags and character references are each replaced by a single space, runs of
 * whitespace are collapsed, and the result is trimmed and lower-cased.
 *
 * @param {string} htmlFragment - Raw HTML (for example a whole `<p>...</p>` match).
 * @returns {string} Normalized text.
 */
function normalizeParagraphText(htmlFragment) {
  return htmlFragment
    .replace(/<[^>]*>/g, " ")
    // Cover named (&amp;), decimal (&#39;) and hexadecimal (&#x27;) references;
    // a bare \w+ misses the numeric forms because "#" is not a word character.
    .replace(/&(?:#\d+|#x[0-9a-f]+|\w+);/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
}
89
/**
 * Splits text on whitespace and trims non-alphanumeric characters from the
 * edges of each piece, dropping pieces that end up empty.
 *
 * Letters, combining marks and digits of any script are kept (Unicode
 * property escapes), so accented and non-Latin words survive as tokens rather
 * than being truncated or discarded. Characters inside a word (apostrophes,
 * hyphens) are left untouched.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Tokens in their original order and casing.
 */
function tokenize(text) {
  const edgeNoise = /^[^\p{L}\p{M}\p{N}]+|[^\p{L}\p{M}\p{N}]+$/gu;
  return text
    .split(/\s+/)
    .map((word) => word.replace(edgeNoise, ""))
    .filter((word) => word.length > 0);
}
92
/**
 * Decides whether a short paragraph is site boilerplate rather than content.
 *
 * @param {string} text - Paragraph text to test.
 * @param {number} words - Word count of the paragraph.
 * @returns {boolean} True when the paragraph has fewer than 20 words and
 *   matches BOILERPLATE_PATTERNS, or has fewer than 30 words and mentions
 *   cookies, GDPR, consent or opting out.
 */
function isBoilerplateParagraph(text, words) {
  if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
  // "cookies?" also catches the plural used by most consent banners; the
  // separator in "opt out" is limited to whitespace, "_" or "-" instead of
  // an unescaped "." wildcard.
  if (words < 30 && /\b(cookies?|gdpr|consent|opt[\s_-]?out)\b/i.test(text)) return true;
  return false;
}
97
/**
 * Detects short "Label: value" paragraphs.
 *
 * @param {string} text - Paragraph text to test.
 * @param {number} words - Word count of the whole paragraph.
 * @returns {boolean} True when the text opens with a label of at most 60
 *   non-colon characters followed by a colon and whitespace, the label holds
 *   between 1 and MAX_METADATA_LABEL_WORDS tokens, and the paragraph has no
 *   more than MAX_METADATA_WORDS words.
 */
function isMetadataParagraph(text, words) {
  const label = /^([^:]{1,60}):\s+/.exec(text);
  if (label === null) return false;
  const labelWordCount = tokenize(label[1]).length;
  if (labelWordCount === 0 || labelWordCount > MAX_METADATA_LABEL_WORDS) return false;
  return words <= MAX_METADATA_WORDS;
}
103
/**
 * Builds the set of n-word shingles (sliding windows) over a token list.
 *
 * @param {string[]} words - Tokens in order.
 * @param {number} [n=4] - Window length in tokens.
 * @returns {Set<string>} Distinct windows, each joined with single spaces;
 *   empty when there are fewer than n tokens.
 */
function buildShinglesFromTokens(words, n = 4) {
  const windows = /* @__PURE__ */ new Set();
  const lastStart = words.length - n;
  let start = 0;
  while (start <= lastStart) {
    windows.add(words.slice(start, start + n).join(" "));
    start += 1;
  }
  return windows;
}
110
/**
 * Converts one HTML paragraph fragment into a comparable paragraph record.
 *
 * @param {string} htmlFragment - Raw HTML of a single paragraph.
 * @returns {{text: string, shingles: Set<string>} | null} The normalized text
 *   and its shingle set, or null when the paragraph has fewer than
 *   MIN_SUBSTANTIVE_WORDS tokens, is classified as boilerplate or metadata,
 *   or yields fewer than 3 distinct shingles.
 */
function createParagraph(htmlFragment) {
  const text = normalizeParagraphText(htmlFragment);
  const tokens = tokenize(text);
  const wordCount = tokens.length;
  const rejected =
    wordCount < MIN_SUBSTANTIVE_WORDS ||
    isBoilerplateParagraph(text, wordCount) ||
    isMetadataParagraph(text, wordCount);
  if (rejected) return null;
  const shingles = buildShinglesFromTokens(tokens);
  return shingles.size >= 3 ? { text, shingles } : null;
}
120
/**
 * Removes elements that do not carry page body content.
 *
 * The first pass deletes script, style, nav, header, footer and noscript
 * elements (opening tag through the first matching closing tag); the second
 * pass does the same for aside elements.
 *
 * @param {string} html - Full page HTML.
 * @returns {string} HTML with those elements removed.
 */
function stripNonContentHtml(html) {
  const withoutChrome = html.replace(
    /<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi,
    ""
  );
  return withoutChrome.replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
}
123
/**
 * Collects every usable paragraph record from a page.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} Records for each
 *   `<p>` element left after stripNonContentHtml that createParagraph accepts,
 *   in document order.
 */
function extractDuplicateContentParagraphs(html) {
  const fragments = stripNonContentHtml(html).match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) ?? [];
  const paragraphs = [];
  for (const fragment of fragments) {
    const paragraph = createParagraph(fragment);
    if (paragraph !== null) paragraphs.push(paragraph);
  }
  return paragraphs;
}
128
/**
 * Splits a page into sections at each h2/h3 opening tag and gathers the
 * usable paragraphs of every section.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections holding at least one accepted paragraph. `heading` is the
 *   tag-stripped, trimmed text of the section's first h2/h3, or "(intro)"
 *   when the chunk contains no such heading.
 */
function extractDuplicateContentSections(html) {
  const chunks = stripNonContentHtml(html).split(/(?=<h[23]\b[^>]*>)/i);
  return chunks.reduce((sections, chunk) => {
    const headingMatch = /<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i.exec(chunk);
    const heading = headingMatch === null ? "(intro)" : headingMatch[1].replace(/<[^>]*>/g, "").trim();
    const paragraphs = [];
    for (const fragment of chunk.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) ?? []) {
      const paragraph = createParagraph(fragment);
      if (paragraph !== null) paragraphs.push(paragraph);
    }
    if (paragraphs.length > 0) sections.push({ heading, paragraphs });
    return sections;
  }, []);
}
140
/**
 * Jaccard similarity of two shingle sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} A value between 0 and 1; 0 when both sets are empty.
 */
function shingleJaccardSimilarity(a, b) {
  if (a.size === 0 && b.size === 0) return 0;
  const shared = [...a].filter((shingle) => b.has(shingle)).length;
  const combined = a.size + b.size - shared;
  return combined === 0 ? 0 : shared / combined;
}
149
+
81
150
  // src/site-crawler.ts
82
151
  async function fetchText(url) {
83
152
  try {
@@ -2558,6 +2627,186 @@ function checkImageContextAI(data) {
2558
2627
  }
2559
2628
  return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
2560
2629
  }
2630
/**
 * Finds pairs of sections on one page that share near-identical paragraphs.
 *
 * Each pair of sections contributes at most one entry: the first paragraph
 * pair (scanning the earlier section's paragraphs outermost) whose shingle
 * Jaccard similarity exceeds 0.4.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{headingA: string, headingB: string, similarity: number, sample: string}>}
 *   One entry per matching section pair; `similarity` is a rounded
 *   percentage and `sample` is the first 80 characters of the earlier
 *   section's matching paragraph.
 */
function findIntraPageDuplicates(html) {
  const sections = extractDuplicateContentSections(html);
  const pairs = [];
  if (sections.length < 2) return pairs;
  for (let first = 0; first < sections.length; first++) {
    const earlier = sections[first];
    for (let second = first + 1; second < sections.length; second++) {
      const later = sections[second];
      // Stop scanning this section pair as soon as one match is recorded.
      scan: for (const left of earlier.paragraphs) {
        for (const right of later.paragraphs) {
          const sim = shingleJaccardSimilarity(left.shingles, right.shingles);
          if (!(sim > 0.4)) continue;
          pairs.push({
            headingA: earlier.heading,
            headingB: later.heading,
            similarity: Math.round(sim * 100),
            sample: left.text.slice(0, 80)
          });
          break scan;
        }
      }
    }
  }
  return pairs;
}
2657
/**
 * Site-level criterion: duplicate text blocks between sections of a page.
 *
 * Analyzes `data.homepage` and every entry of `data.blogSample` with
 * findIntraPageDuplicates, scores the site from the share of pages that have
 * at least one duplicate section pair, and appends up to two example pairs
 * for each of the first three affected pages.
 *
 * @param {object} data - Crawl data; reads `homepage`, `blogSample` and `domain`.
 * @returns {{criterion: string, criterion_label: string, score: number, status: string, findings: object[], fix_priority: string}}
 *   Criterion result with a score of 0, 3, 5, 7, 9 or 10.
 */
function checkDuplicateContent(data) {
  const identity = { criterion: "duplicate_content", criterion_label: "Duplicate Content Blocks" };
  const findings = [];

  const pages = [];
  if (data.homepage) {
    pages.push({ html: data.homepage.text, url: data.homepage.finalUrl || `https://${data.domain}/` });
  }
  for (const post of data.blogSample || []) {
    pages.push({ html: post.text, url: post.finalUrl || "" });
  }
  if (pages.length === 0) {
    findings.push({ severity: "critical", detail: "No pages available for duplicate content analysis" });
    return { ...identity, score: 0, status: "fail", findings, fix_priority: "P1" };
  }

  // Keep only the pages on which at least one duplicate section pair was found.
  const dupDetails = pages
    .map((page) => ({ url: page.url, pairs: findIntraPageDuplicates(page.html) }))
    .filter((entry) => entry.pairs.length > 0);
  const totalDupPages = dupDetails.length;
  const totalDupPairs = dupDetails.reduce((sum, entry) => sum + entry.pairs.length, 0);
  const dupRatio = totalDupPages / pages.length;
  const dupPercent = Math.round(dupRatio * 100);

  let score;
  let summary;
  if (totalDupPairs === 0) {
    score = 10;
    summary = { severity: "info", detail: `${pages.length} pages analyzed - no duplicate content blocks detected` };
  } else if (dupRatio <= 0.05 && totalDupPairs <= 2) {
    score = 9;
    summary = { severity: "info", detail: `${totalDupPairs} duplicate block pair(s) on ${totalDupPages} page(s) - minor` };
  } else if (dupRatio <= 0.1) {
    score = 7;
    summary = { severity: "low", detail: `${totalDupPairs} duplicate block pair(s) across ${totalDupPages} page(s)`, fix: "Rewrite duplicate sections to provide unique content in each" };
  } else if (dupRatio <= 0.2) {
    score = 5;
    summary = { severity: "medium", detail: `${totalDupPages} pages (${dupPercent}%) contain duplicate content blocks`, fix: "Rewrite or remove repeated text blocks - LLMs may flag this as low-quality content" };
  } else if (dupRatio <= 0.4) {
    score = 3;
    summary = { severity: "medium", detail: `${totalDupPages} pages (${dupPercent}%) have significant duplicate content`, fix: "Widespread duplicate blocks reduce content authority - rewrite each section with unique angles" };
  } else {
    score = 0;
    summary = { severity: "high", detail: `${totalDupPages} pages (${dupPercent}%) contain duplicate content blocks`, fix: "Severe content duplication across the site - LLMs will likely reduce citation authority" };
  }
  findings.push(summary);

  for (const { url, pairs } of dupDetails.slice(0, 3)) {
    const shortUrl = url.slice(0, 60);
    pairs.slice(0, 2).forEach((pair) => {
      findings.push({
        severity: "low",
        detail: `${shortUrl}: '${pair.headingA}' and '${pair.headingB}' share ${pair.similarity}% similar text ("${pair.sample}...")`,
        fix: `Rewrite one of these sections to eliminate duplicate content`
      });
    });
  }

  const passing = score >= 7;
  const status = passing ? "pass" : score >= 4 ? "partial" : "fail";
  return { ...identity, score, status, findings, fix_priority: passing ? "P3" : "P1" };
}
2716
/**
 * Site-level criterion: paragraphs copied between different pages.
 *
 * Extracts paragraphs from `data.homepage` and every `data.blogSample` entry,
 * ignores paragraphs that recur on many pages (site-wide boilerplate), and
 * flags a page pair when at least two paragraphs of the first page have a
 * counterpart on the second with shingle Jaccard similarity above 0.4.
 *
 * Affected pages are counted by page index, not by display URL: the URLs kept
 * for reporting are truncated to 60 characters (and are "" when a page has no
 * finalUrl), so distinct pages can share the same display string.
 *
 * @param {object} data - Crawl data; reads `homepage`, `blogSample` and `domain`.
 * @returns {{criterion: string, criterion_label: string, score: number, status: string, findings: object[], fix_priority: string}}
 *   Criterion result; the score is 5 when fewer than two pages are available,
 *   otherwise 0, 3, 5, 7, 9 or 10.
 */
function checkCrossPageDuplication(data) {
  const findings = [];
  const pages = [];
  if (data.homepage) {
    pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
  }
  if (data.blogSample) {
    for (const page of data.blogSample) {
      pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
    }
  }
  if (pages.length <= 1) {
    findings.push({ severity: "info", detail: "Not enough pages to assess cross-page duplication" });
    return { criterion: "cross_page_duplication", criterion_label: "Cross-Page Duplicate Content", score: 5, status: "partial", findings, fix_priority: "P3" };
  }

  // A paragraph is fingerprinted by its first five shingles.
  const fingerprintOf = (paragraph) => [...paragraph.shingles].slice(0, 5).join("|");

  // Count on how many pages each fingerprint occurs (once per page).
  const paragraphPageCount = /* @__PURE__ */ new Map();
  for (const page of pages) {
    const seen = /* @__PURE__ */ new Set();
    for (const p of page.paragraphs) {
      const fp = fingerprintOf(p);
      if (!seen.has(fp)) {
        seen.add(fp);
        paragraphPageCount.set(fp, (paragraphPageCount.get(fp) || 0) + 1);
      }
    }
  }

  // Fingerprints seen on at least max(3, 40% of pages) pages are treated as
  // site-wide boilerplate and skipped on the left-hand side of a comparison.
  const boilerplateThreshold = Math.max(3, pages.length * 0.4);
  const siteBoilerprints = /* @__PURE__ */ new Set();
  for (const [fp, count] of paragraphPageCount) {
    if (count >= boilerplateThreshold) siteBoilerprints.add(fp);
  }

  const crossDupPairs = [];
  const affectedPageIndexes = /* @__PURE__ */ new Set();
  for (let i = 0; i < pages.length; i++) {
    for (let j = i + 1; j < pages.length; j++) {
      let dupCount = 0;
      let sample = "";
      for (const pA of pages[i].paragraphs) {
        if (siteBoilerprints.has(fingerprintOf(pA))) continue;
        for (const pB of pages[j].paragraphs) {
          const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
          if (sim > 0.4) {
            dupCount++;
            if (!sample) sample = pA.text.slice(0, 80);
            break;
          }
        }
      }
      if (dupCount >= 2) {
        crossDupPairs.push({
          urlA: pages[i].url.slice(0, 60),
          urlB: pages[j].url.slice(0, 60),
          dupCount,
          sample
        });
        // Track identity by index so truncated or empty URLs cannot merge pages.
        affectedPageIndexes.add(i);
        affectedPageIndexes.add(j);
      }
    }
  }

  const affectedCount = affectedPageIndexes.size;
  const affectedRatio = affectedCount / pages.length;
  const affectedPercent = Math.round(affectedRatio * 100);
  const totalDupParagraphs = crossDupPairs.reduce((s, p) => s + p.dupCount, 0);
  let score;
  if (crossDupPairs.length === 0) {
    score = 10;
    findings.push({ severity: "info", detail: `${pages.length} pages analyzed - no cross-page content duplication detected` });
  } else if (affectedRatio <= 0.05 && totalDupParagraphs <= 4) {
    score = 9;
    findings.push({ severity: "info", detail: `${totalDupParagraphs} shared paragraph(s) across ${affectedCount} page(s) - minor` });
  } else if (affectedRatio <= 0.1) {
    score = 7;
    findings.push({ severity: "low", detail: `${totalDupParagraphs} shared paragraphs across ${affectedCount} pages`, fix: "Rewrite shared content so each page provides a unique perspective" });
  } else if (affectedRatio <= 0.2) {
    score = 5;
    findings.push({ severity: "medium", detail: `${affectedCount} pages (${affectedPercent}%) share duplicate paragraphs`, fix: "Significant cross-page duplication - AI engines may only index one version" });
  } else if (affectedRatio <= 0.4) {
    score = 3;
    findings.push({ severity: "medium", detail: `${affectedCount} pages (${affectedPercent}%) contain shared content blocks`, fix: "Widespread copy-paste content across pages reduces overall site authority" });
  } else {
    score = 0;
    findings.push({ severity: "high", detail: `${affectedCount} pages (${affectedPercent}%) share duplicate content`, fix: "Severe cross-page duplication - AI engines will likely ignore redundant pages entirely" });
  }
  for (const pair of crossDupPairs.slice(0, 3)) {
    findings.push({
      severity: "low",
      detail: `${pair.dupCount} shared paragraph(s): ${pair.urlA} \u2194 ${pair.urlB} ("${pair.sample}...")`,
      fix: "Rewrite shared paragraphs so each page has unique content"
    });
  }
  return { criterion: "cross_page_duplication", criterion_label: "Cross-Page Duplicate Content", score, status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P1" };
}
2561
2810
  function auditSiteFromData(data) {
2562
2811
  const topicCoherence = checkTopicCoherence(data);
2563
2812
  const cannibalization = checkContentCannibalization(data, topicCoherence.score);
@@ -2596,7 +2845,10 @@ function auditSiteFromData(data) {
2596
2845
  checkEvidencePackaging(data),
2597
2846
  checkEntityDisambiguation(data),
2598
2847
  checkExtractionFriction(data),
2599
- checkImageContextAI(data)
2848
+ checkImageContextAI(data),
2849
+ // V3 criteria (#35-#36)
2850
+ checkDuplicateContent(data),
2851
+ checkCrossPageDuplication(data)
2600
2852
  ];
2601
2853
  }
2602
2854
  async function auditSite(targetUrl) {
@@ -2620,11 +2872,11 @@ var WEIGHTS = {
2620
2872
  // Information density per page
2621
2873
  direct_answer_density: 0.05,
2622
2874
  // Direct answers to queries
2623
- qa_content_format: 0.05,
2875
+ qa_content_format: 0.04,
2624
2876
  // Answer-shaped content structure
2625
- query_answer_alignment: 0.05,
2877
+ query_answer_alignment: 0.04,
2626
2878
  // Relevance to actual AI queries
2627
- faq_section: 0.04,
2879
+ faq_section: 0.03,
2628
2880
  // Structured Q&A pairs
2629
2881
  // ─── Content Organization (~30%) ──────────────────────────────────────────
2630
2882
  // HOW easily AI engines can extract and trust your content.
@@ -2672,8 +2924,13 @@ var WEIGHTS = {
2672
2924
  // Clear entity boundaries
2673
2925
  extraction_friction: 0.02,
2674
2926
  // Sentence length, voice, jargon
2675
- image_context_ai: 0.01
2927
+ image_context_ai: 0.01,
2676
2928
  // Figure/figcaption, alt text quality
2929
+ // ─── V3 Criteria ────────────────────────────────────────────────────────
2930
+ duplicate_content: 0.05,
2931
+ // Duplicate text blocks within pages
2932
+ cross_page_duplication: 0.03
2933
+ // Same paragraphs copied across pages
2677
2934
  };
2678
2935
  function calculateOverallScore(criteria) {
2679
2936
  let totalWeight = 0;
@@ -2702,7 +2959,9 @@ var PILLARS = {
2702
2959
  "fact_density",
2703
2960
  "citation_ready_writing",
2704
2961
  "answer_first_placement",
2705
- "evidence_packaging"
2962
+ "evidence_packaging",
2963
+ "duplicate_content",
2964
+ "cross_page_duplication"
2706
2965
  ],
2707
2966
  "Content Structure": [
2708
2967
  "direct_answer_density",
@@ -2767,6 +3026,8 @@ var CLIENT_NAMES = {
2767
3026
  image_context_ai: "Image Context for AI",
2768
3027
  schema_coverage: "Schema Coverage",
2769
3028
  speakable_schema: "Speakable Schema",
3029
+ duplicate_content: "Duplicate Content Blocks",
3030
+ cross_page_duplication: "Cross-Page Duplicate Content",
2770
3031
  content_cannibalization: "Content Cannibalization",
2771
3032
  llms_txt: "llms.txt File",
2772
3033
  robots_txt: "robots.txt for AI",
@@ -2784,10 +3045,12 @@ var PILLAR_WEIGHTS = {
2784
3045
  citation_ready_writing: 0.04,
2785
3046
  answer_first_placement: 0.03,
2786
3047
  evidence_packaging: 0.03,
3048
+ duplicate_content: 0.05,
3049
+ cross_page_duplication: 0.03,
2787
3050
  direct_answer_density: 0.05,
2788
- qa_content_format: 0.05,
2789
- query_answer_alignment: 0.05,
2790
- faq_section: 0.04,
3051
+ qa_content_format: 0.04,
3052
+ query_answer_alignment: 0.04,
3053
+ faq_section: 0.03,
2791
3054
  table_list_extractability: 0.03,
2792
3055
  definition_patterns: 0.02,
2793
3056
  entity_disambiguation: 0.02,
@@ -2820,6 +3083,8 @@ var CRITERION_EFFORT = {
2820
3083
  citation_ready_writing: "Medium",
2821
3084
  answer_first_placement: "Medium",
2822
3085
  evidence_packaging: "Medium",
3086
+ duplicate_content: "Medium",
3087
+ cross_page_duplication: "Medium",
2823
3088
  direct_answer_density: "Medium",
2824
3089
  qa_content_format: "Medium",
2825
3090
  query_answer_alignment: "Medium",
@@ -2875,6 +3140,8 @@ var FIX_DESCRIPTIONS = {
2875
3140
  image_context_ai: "Wrap images in <figure>/<figcaption> with descriptive alt text.",
2876
3141
  schema_coverage: "Extend structured data to inner pages (articles, services, products).",
2877
3142
  speakable_schema: "Add SpeakableSpecification schema for voice assistant compatibility.",
3143
+ duplicate_content: "Rewrite duplicate text blocks so each section provides unique value.",
3144
+ cross_page_duplication: "Rewrite shared paragraphs across pages so each page has unique content.",
2878
3145
  content_cannibalization: "Consolidate overlapping pages or differentiate titles and H1 headings.",
2879
3146
  llms_txt: "Create a /llms.txt file describing your site for AI engines.",
2880
3147
  robots_txt: "Update robots.txt to explicitly allow AI crawlers.",
@@ -2970,7 +3237,9 @@ var CRITERION_LABELS = {
2970
3237
  "Evidence Packaging": "Evidence Packaging",
2971
3238
  "Entity Disambiguation": "Entity Disambiguation",
2972
3239
  "Extraction Friction Score": "Extraction Friction Score",
2973
- "Image Context for AI": "Image Context for AI"
3240
+ "Image Context for AI": "Image Context for AI",
3241
+ "Duplicate Content Blocks": "Duplicate Content Blocks",
3242
+ "Cross-Page Duplicate Content": "Cross-Page Duplicate Content"
2974
3243
  };
2975
3244
  function scoreToStatus(score) {
2976
3245
  if (score === 0) return "MISSING";
@@ -3065,9 +3334,9 @@ var CRITERION_WEIGHTS = {
3065
3334
  content_depth: 0.07,
3066
3335
  fact_density: 0.06,
3067
3336
  direct_answer_density: 0.05,
3068
- qa_content_format: 0.05,
3069
- query_answer_alignment: 0.05,
3070
- faq_section: 0.04,
3337
+ qa_content_format: 0.04,
3338
+ query_answer_alignment: 0.04,
3339
+ faq_section: 0.03,
3071
3340
  // Content Organization (~30%)
3072
3341
  entity_consistency: 0.05,
3073
3342
  internal_linking: 0.04,
@@ -3096,7 +3365,10 @@ var CRITERION_WEIGHTS = {
3096
3365
  evidence_packaging: 0.03,
3097
3366
  entity_disambiguation: 0.02,
3098
3367
  extraction_friction: 0.02,
3099
- image_context_ai: 0.01
3368
+ image_context_ai: 0.01,
3369
+ // V3 Criteria
3370
+ duplicate_content: 0.05,
3371
+ cross_page_duplication: 0.03
3100
3372
  };
3101
3373
  var OPPORTUNITY_TEMPLATES = {
3102
3374
  llms_txt: {
@@ -3219,6 +3491,16 @@ var OPPORTUNITY_TEMPLATES = {
3219
3491
  effort: "Medium",
3220
3492
  description: "Ensure every question-format heading (H2/H3) is followed by a direct answer paragraph. This pattern is ideal for AI engine snippet extraction."
3221
3493
  },
3494
+ duplicate_content: {
3495
+ name: "Fix Duplicate Content Blocks",
3496
+ effort: "Medium",
3497
+ description: "Sections within pages contain identical or near-identical text. LLMs may flag this as low-quality or thin content, reducing citation authority. Rewrite duplicate blocks with unique angles."
3498
+ },
3499
+ cross_page_duplication: {
3500
+ name: "Eliminate Cross-Page Duplicate Content",
3501
+ effort: "Medium",
3502
+ description: "The same paragraphs appear on multiple pages. AI engines may only index one version and ignore the rest. Rewrite shared content so each page offers a unique perspective."
3503
+ },
3222
3504
  content_cannibalization: {
3223
3505
  name: "Resolve Content Cannibalization",
3224
3506
  effort: "Medium",
@@ -3627,9 +3909,9 @@ var PAGE_CRITERIA = {
3627
3909
  original_data: { weight: 0.1, label: "Original Data & Expert Content" },
3628
3910
  fact_density: { weight: 0.06, label: "Fact & Data Density" },
3629
3911
  direct_answer_density: { weight: 0.05, label: "Direct Answer Paragraphs" },
3630
- qa_content_format: { weight: 0.05, label: "Q&A Content Format" },
3631
- query_answer_alignment: { weight: 0.05, label: "Query-Answer Alignment" },
3632
- faq_section: { weight: 0.04, label: "FAQ Section Content" },
3912
+ qa_content_format: { weight: 0.04, label: "Q&A Content Format" },
3913
+ query_answer_alignment: { weight: 0.04, label: "Query-Answer Alignment" },
3914
+ faq_section: { weight: 0.03, label: "FAQ Section Content" },
3633
3915
  // Content Organization
3634
3916
  content_freshness: { weight: 0.04, label: "Content Freshness Signals" },
3635
3917
  schema_markup: { weight: 0.03, label: "Schema.org Structured Data" },
@@ -3646,7 +3928,8 @@ var PAGE_CRITERIA = {
3646
3928
  evidence_packaging: { weight: 0.03, label: "Evidence Packaging" },
3647
3929
  entity_disambiguation: { weight: 0.02, label: "Entity Disambiguation" },
3648
3930
  extraction_friction: { weight: 0.02, label: "Extraction Friction Score" },
3649
- image_context_ai: { weight: 0.01, label: "Image Context for AI" }
3931
+ image_context_ai: { weight: 0.01, label: "Image Context for AI" },
3932
+ duplicate_content: { weight: 0.05, label: "Duplicate Content Blocks" }
3650
3933
  };
3651
3934
  function extractJsonLdBlocks(html) {
3652
3935
  const blocks = [];
@@ -4095,6 +4378,48 @@ function scoreImageContextAI(html) {
4095
4378
  if (contextualImages.length > 0) score += 3;
4096
4379
  return cap(score, 10);
4097
4380
  }
4381
/**
 * Page-level duplicate-content score.
 *
 * @param {string} html - Full page HTML.
 * @returns {number} The `score` field of scoreDuplicateContentDetailed(html).
 */
function scoreDuplicateContent(html) {
  const { score } = scoreDuplicateContentDetailed(html);
  return score;
}
4384
/**
 * Scores a single page for duplicated paragraphs between its sections and
 * reports each duplicate found.
 *
 * For every ordered pair of sections, each paragraph of the earlier section
 * counts once if some paragraph of the later section has a shingle Jaccard
 * similarity above 0.4 with it.
 *
 * @param {string} html - Full page HTML.
 * @returns {{score: number, duplicates: Array<{headingA: string, headingB: string, similarity: number, sample: string}>}}
 *   Score 10 with no duplicates; 6 for a single duplicate that is at most 5%
 *   of all paragraphs, 4 for any other single duplicate, 2 for two, 0 for more.
 */
function scoreDuplicateContentDetailed(html) {
  const sections = extractDuplicateContentSections(html);
  if (sections.length < 2) return { score: 10, duplicates: [] };

  let totalParagraphs = 0;
  for (const section of sections) totalParagraphs += section.paragraphs.length;

  const duplicates = [];
  let dupParagraphCount = 0;
  sections.forEach((earlier, index) => {
    for (const later of sections.slice(index + 1)) {
      for (const left of earlier.paragraphs) {
        // Only the first counterpart in the later section is recorded.
        for (const right of later.paragraphs) {
          const sim = shingleJaccardSimilarity(left.shingles, right.shingles);
          if (!(sim > 0.4)) continue;
          dupParagraphCount += 1;
          duplicates.push({
            headingA: earlier.heading,
            headingB: later.heading,
            similarity: Math.round(sim * 100),
            sample: left.text.slice(0, 80)
          });
          break;
        }
      }
    }
  });
  if (dupParagraphCount === 0) return { score: 10, duplicates: [] };

  const dupRatio = totalParagraphs > 0 ? dupParagraphCount / totalParagraphs : 0;
  let score = 0;
  if (dupParagraphCount === 1) {
    score = dupRatio <= 0.05 ? 6 : 4;
  } else if (dupParagraphCount === 2) {
    score = 2;
  }
  return { score, duplicates };
}
4098
4423
  var SCORING_FUNCTIONS = {
4099
4424
  schema_markup: scoreSchemaMarkup,
4100
4425
  qa_content_format: scoreQAFormat,
@@ -4115,7 +4440,8 @@ var SCORING_FUNCTIONS = {
4115
4440
  evidence_packaging: scoreEvidencePackaging,
4116
4441
  entity_disambiguation: scoreEntityDisambiguation,
4117
4442
  extraction_friction: scoreExtractionFriction,
4118
- image_context_ai: scoreImageContextAI
4443
+ image_context_ai: scoreImageContextAI,
4444
+ duplicate_content: scoreDuplicateContent
4119
4445
  };
4120
4446
  function scorePage(html, url) {
4121
4447
  let totalWeight = 0;
@@ -4129,6 +4455,11 @@ function scorePage(html, url) {
4129
4455
  totalWeight += weight;
4130
4456
  }
4131
4457
  let aeoScore = totalWeight === 0 ? 0 : Math.round(weightedSum / totalWeight);
4458
+ const dupScore = criterionScores.find((c) => c.criterion === "duplicate_content")?.score ?? 10;
4459
+ if (dupScore <= 6) {
4460
+ const dupCap = 35 + dupScore * 5;
4461
+ aeoScore = Math.min(aeoScore, dupCap);
4462
+ }
4132
4463
  const scoreCapped = aeoScore > 75;
4133
4464
  if (scoreCapped) aeoScore = 75;
4134
4465
  return { aeoScore, criterionScores, scoreCapped };
@@ -4348,6 +4679,15 @@ function checkHasCitationReadyContent(html) {
4348
4679
  }
4349
4680
  return null;
4350
4681
  }
4682
/**
 * Page issue check for duplicated content blocks.
 *
 * @param {string} html - Full page HTML.
 * @returns {{check: string, label: string, severity: string} | null} An issue
 *   describing the first duplicate when the page's duplicate-content score is
 *   6 or lower and at least one duplicate was recorded ("error" for scores of
 *   3 or lower, otherwise "warning"); null when there is nothing to report.
 */
function checkDuplicateContentBlocks(html) {
  const { score, duplicates } = scoreDuplicateContentDetailed(html);
  if (score > 6 || duplicates.length === 0) return null;

  const [first] = duplicates;
  let label;
  if (duplicates.length === 1) {
    label = `Duplicate content: '${first.headingA}' and '${first.headingB}' share ${first.similarity}% similar text ("${first.sample}...")`;
  } else {
    label = `${duplicates.length} duplicate blocks found (e.g. '${first.headingA}' and '${first.headingB}' \u2014 "${first.sample}...")`;
  }
  const severity = score <= 3 ? "error" : "warning";
  return { check: "duplicate-content", label, severity };
}
4351
4691
  function analyzePage(html, url, category) {
4352
4692
  const title = extractTitle(html);
4353
4693
  const textContent = getTextContent2(html);
@@ -4366,7 +4706,8 @@ function analyzePage(html, url, category) {
4366
4706
  checkImagesMissingAlt(html),
4367
4707
  checkNoInternalLinks(html, url),
4368
4708
  checkNoAnswerBlock(html),
4369
- checkNoEvidence(html, url)
4709
+ checkNoEvidence(html, url),
4710
+ checkDuplicateContentBlocks(html)
4370
4711
  ];
4371
4712
  for (const result of issueChecks) {
4372
4713
  if (result) issues.push(result);
@@ -4639,9 +4980,9 @@ var CRITERION_WEIGHTS2 = {
4639
4980
  content_depth: 0.07,
4640
4981
  fact_density: 0.06,
4641
4982
  direct_answer_density: 0.05,
4642
- qa_content_format: 0.05,
4643
- query_answer_alignment: 0.05,
4644
- faq_section: 0.04,
4983
+ qa_content_format: 0.04,
4984
+ query_answer_alignment: 0.04,
4985
+ faq_section: 0.03,
4645
4986
  // Content Organization (~30%)
4646
4987
  entity_consistency: 0.05,
4647
4988
  internal_linking: 0.04,
@@ -4655,6 +4996,8 @@ var CRITERION_WEIGHTS2 = {
4655
4996
  clean_html: 0.02,
4656
4997
  // Technical Plumbing (~15%)
4657
4998
  content_cannibalization: 0.02,
4999
+ duplicate_content: 0.05,
5000
+ cross_page_duplication: 0.03,
4658
5001
  llms_txt: 0.02,
4659
5002
  robots_txt: 0.02,
4660
5003
  content_velocity: 0.02,
@@ -4699,7 +5042,9 @@ var PHASE_CONFIG = [
4699
5042
  "citation_ready_writing",
4700
5043
  "answer_first_placement",
4701
5044
  "evidence_packaging",
4702
- "entity_disambiguation"
5045
+ "entity_disambiguation",
5046
+ "duplicate_content",
5047
+ "cross_page_duplication"
4703
5048
  ]
4704
5049
  },
4705
5050
  {
@@ -5579,6 +5924,66 @@ Summarization: yes`,
5579
5924
  }
5580
5925
  return fixes;
5581
5926
  },
5927
+ duplicate_content: (c, pages) => {
5928
+ if (c.score >= 10) return [];
5929
+ const impact = impactFromScore(c.score);
5930
+ const effort = effortForCriterion("duplicate_content", c.score);
5931
+ const affected = getAffectedPages("duplicate_content", pages);
5932
+ const sectionPairs = c.findings.filter((f) => f.detail.includes("' and '")).map((f) => {
5933
+ const match = f.detail.match(/'([^']+)' and '([^']+)'/);
5934
+ return match ? { a: match[1], b: match[2] } : null;
5935
+ }).filter(Boolean);
5936
+ const steps = [
5937
+ "Identify sections with duplicate or near-identical text",
5938
+ "Rewrite each section to provide a unique angle on the topic",
5939
+ "Ensure each heading section adds new information for the reader"
5940
+ ];
5941
+ if (sectionPairs.length > 0) {
5942
+ const pair = sectionPairs[0];
5943
+ steps.unshift(`Start with '${pair.a}' and '${pair.b}' which share similar text`);
5944
+ }
5945
+ return [{
5946
+ id: "fix-duplicate-content",
5947
+ criterion: c.criterion_label,
5948
+ criterionId: c.criterion,
5949
+ title: "Fix duplicate content blocks",
5950
+ description: "Sections within pages contain identical or near-identical text. LLMs may flag this as low-quality content, reducing the authority of the page.",
5951
+ impact,
5952
+ effort,
5953
+ impactScore: 0,
5954
+ category: "content",
5955
+ steps,
5956
+ successCriteria: "Each section within a page provides unique content",
5957
+ affectedPages: affected,
5958
+ pageCount: affected?.length
5959
+ }];
5960
+ },
5961
+ cross_page_duplication: (c, pages) => {
5962
+ if (c.score >= 10) return [];
5963
+ const impact = impactFromScore(c.score);
5964
+ const effort = effortForCriterion("cross_page_duplication", c.score);
5965
+ const affected = getAffectedPages("cross_page_duplication", pages);
5966
+ return [{
5967
+ id: "fix-cross-page-duplication",
5968
+ criterion: c.criterion_label,
5969
+ criterionId: c.criterion,
5970
+ title: "Eliminate cross-page duplicate content",
5971
+ description: "The same paragraphs appear on multiple pages. AI engines may only index one version, wasting the others.",
5972
+ impact,
5973
+ effort,
5974
+ impactScore: 0,
5975
+ category: "content",
5976
+ steps: [
5977
+ "Identify paragraphs that are copy-pasted across multiple pages",
5978
+ "Rewrite each instance to provide a unique angle relevant to that page",
5979
+ "Move truly shared content to a single resource page and link to it",
5980
+ "Use canonical tags if pages must share content"
5981
+ ],
5982
+ successCriteria: "Each page has unique body content with no copy-pasted paragraphs",
5983
+ affectedPages: affected,
5984
+ pageCount: affected?.length
5985
+ }];
5986
+ },
5582
5987
  visible_date_signal: (c, pages) => {
5583
5988
  if (c.score >= 10) return [];
5584
5989
  const impact = impactFromScore(c.score);