aeorank 3.0.3 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -2556,6 +2556,234 @@ function checkImageContextAI(data) {
2556
2556
  }
2557
2557
  return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
2558
2558
  }
2559
// Call-to-action and legal phrases that mark a short paragraph as boilerplate.
var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;

/**
 * Decide whether a paragraph is boilerplate rather than real content.
 *
 * A paragraph counts as boilerplate when it is under 20 whitespace-separated
 * words and contains a BOILERPLATE_RE phrase, or when it is under 30 words and
 * mentions cookie / gdpr / consent / opt-out wording.
 *
 * @param {string} text - Paragraph text.
 * @returns {boolean} True when the paragraph should be ignored.
 */
function isBoilerplateParagraph(text) {
  const wordCount = text.split(/\s+/).length;
  const isShortCallToAction = wordCount < 20 && BOILERPLATE_RE.test(text);
  const isShortConsentNotice = wordCount < 30 && /\b(cookie|gdpr|consent|opt.out)\b/i.test(text);
  return isShortCallToAction || isShortConsentNotice;
}
2566
/**
 * Break text into overlapping word n-grams ("shingles").
 *
 * Tokens come from splitting on whitespace; tokens of one character or less
 * are discarded before the sliding windows are formed.
 *
 * @param {string} text - Text to shingle.
 * @param {number} [n=4] - Number of words per shingle.
 * @returns {Set<string>} Distinct shingles, words joined by single spaces;
 *   empty when fewer than `n` tokens remain.
 */
function toShingles(text, n = 4) {
  const tokens = text.split(/\s+/).filter((token) => token.length > 1);
  const windowCount = Math.max(0, tokens.length - n + 1);
  const windows = Array.from({ length: windowCount }, (_, start) => tokens.slice(start, start + n).join(" "));
  return new Set(windows);
}
2574
/**
 * Jaccard similarity of two shingle sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} Value between 0 and 1; 0 when both sets are empty.
 */
function shingleSimilarity(a, b) {
  const bothEmpty = a.size === 0 && b.size === 0;
  if (bothEmpty) return 0;
  const shared = [...a].filter((shingle) => b.has(shingle)).length;
  const combined = a.size + b.size - shared;
  return combined === 0 ? 0 : shared / combined;
}
2583
/**
 * Extract the comparable <p> paragraphs of an HTML page.
 *
 * script/style/nav/header/footer/noscript/aside elements are removed first.
 * Each remaining <p> is reduced to lower-cased plain text: tags and character
 * references become spaces and whitespace is collapsed. Paragraphs with fewer
 * than three shingles, or flagged by isBoilerplateParagraph, are dropped.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} Normalised
 *   paragraphs; an empty array when `html` is not a non-empty string.
 */
function extractPageParagraphs(html) {
  // A page whose body is missing has nothing to compare; avoid throwing on .replace.
  if (typeof html !== "string" || html === "") return [];
  const cleaned = html
    .replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "")
    .replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
  const pMatches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
  return pMatches
    .map((p) => {
      const text = p
        .replace(/<[^>]*>/g, " ")
        // Strip named (&amp;), decimal (&#39;) and hex (&#x27;) character references.
        // The previous /&\w+;/ pattern missed numeric ones because "#" is not a word character.
        .replace(/&(?:#\d+|#x[\da-f]+|\w+);/gi, " ")
        .replace(/\s+/g, " ")
        .trim()
        .toLowerCase();
      return { text, shingles: toShingles(text) };
    })
    .filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
}
2591
/**
 * Split a page into h2/h3-delimited sections and collect each section's
 * comparable paragraphs.
 *
 * script/style/nav/header/footer/noscript/aside elements are removed first.
 * Content before the first h2/h3 is reported under the heading "(intro)".
 * Paragraph text is lower-cased with tags and character references replaced by
 * spaces. Paragraphs with fewer than three shingles, or flagged by
 * isBoilerplateParagraph, are dropped, and sections left without paragraphs
 * are omitted.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in document order; empty when `html` is not a non-empty string.
 */
function splitIntoSectionsWithParagraphs(html) {
  // A page whose body is missing has no sections; avoid throwing on .replace.
  if (typeof html !== "string" || html === "") return [];
  const cleaned = html
    .replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "")
    .replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
  // The lookahead keeps each heading tag at the start of its own part.
  const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
  const sections = [];
  for (const part of parts) {
    const hMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
    const heading = hMatch ? hMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
    const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
    const paragraphs = pMatches
      .map((p) => {
        const text = p
          .replace(/<[^>]*>/g, " ")
          // Strip named (&amp;), decimal (&#39;) and hex (&#x27;) character references.
          // The previous /&\w+;/ pattern missed numeric ones because "#" is not a word character.
          .replace(/&(?:#\d+|#x[\da-f]+|\w+);/gi, " ")
          .replace(/\s+/g, " ")
          .trim()
          .toLowerCase();
        return { text, shingles: toShingles(text) };
      })
      .filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
    if (paragraphs.length > 0) sections.push({ heading, paragraphs });
  }
  return sections;
}
2607
/**
 * Find pairs of sections on one page that contain near-identical paragraphs.
 *
 * Every pair of sections is compared; at most one entry is reported per pair,
 * taken from the first paragraph combination whose shingle similarity
 * exceeds 0.4.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{headingA: string, headingB: string, similarity: number, sample: string}>}
 *   One entry per duplicated section pair; `similarity` is a rounded
 *   percentage and `sample` is the first 80 characters of the matching
 *   paragraph from the earlier section.
 */
function findIntraPageDuplicates(html) {
  const sections = splitIntoSectionsWithParagraphs(html);
  if (sections.length < 2) return [];

  // First paragraph combination (left-major order) that crosses the threshold.
  const firstMatch = (left, right) => {
    for (const candidate of left) {
      for (const other of right) {
        const sim = shingleSimilarity(candidate.shingles, other.shingles);
        if (sim > 0.4) return { sim, text: candidate.text };
      }
    }
    return null;
  };

  const pairs = [];
  sections.forEach((first, index) => {
    for (const second of sections.slice(index + 1)) {
      const match = firstMatch(first.paragraphs, second.paragraphs);
      if (match === null) continue;
      pairs.push({
        headingA: first.heading,
        headingB: second.heading,
        similarity: Math.round(match.sim * 100),
        sample: match.text.slice(0, 80)
      });
    }
  });
  return pairs;
}
2634
/**
 * Site-level criterion: duplicate text blocks inside individual pages.
 *
 * Runs findIntraPageDuplicates on the homepage and every blog sample page,
 * then scores the site by the share of pages that contain at least one
 * duplicated section pair.
 *
 * @param {object} data - Crawl data; reads `homepage.text`,
 *   `homepage.finalUrl`, `domain` and `blogSample[].text` / `.finalUrl`.
 * @returns {{criterion: string, criterion_label: string, score: number, status: string, findings: object[], fix_priority: string}}
 *   Criterion result with a 0-10 score.
 */
function checkDuplicateContent(data) {
  const pages = [];
  if (data.homepage) {
    pages.push({ html: data.homepage.text, url: data.homepage.finalUrl || `https://${data.domain}/` });
  }
  for (const post of data.blogSample || []) {
    pages.push({ html: post.text, url: post.finalUrl || "" });
  }

  if (pages.length === 0) {
    return {
      criterion: "duplicate_content",
      criterion_label: "Duplicate Content Blocks",
      score: 0,
      status: "fail",
      findings: [{ severity: "critical", detail: "No pages available for duplicate content analysis" }],
      fix_priority: "P1"
    };
  }

  // Keep only the pages that actually have duplicated section pairs.
  const dupDetails = pages
    .map((page) => ({ url: page.url, pairs: findIntraPageDuplicates(page.html) }))
    .filter((entry) => entry.pairs.length > 0);
  const totalDupPages = dupDetails.length;
  const totalDupPairs = dupDetails.reduce((sum, entry) => sum + entry.pairs.length, 0);
  const dupRatio = totalDupPages / pages.length;
  const percent = Math.round(dupRatio * 100);

  // Ordered from best to worst; the first tier whose condition holds wins.
  const tiers = [
    {
      applies: totalDupPairs === 0,
      score: 10,
      finding: { severity: "info", detail: `${pages.length} pages analyzed - no duplicate content blocks detected` }
    },
    {
      applies: dupRatio <= 0.05 && totalDupPairs <= 2,
      score: 9,
      finding: { severity: "info", detail: `${totalDupPairs} duplicate block pair(s) on ${totalDupPages} page(s) - minor` }
    },
    {
      applies: dupRatio <= 0.1,
      score: 7,
      finding: { severity: "low", detail: `${totalDupPairs} duplicate block pair(s) across ${totalDupPages} page(s)`, fix: "Rewrite duplicate sections to provide unique content in each" }
    },
    {
      applies: dupRatio <= 0.2,
      score: 5,
      finding: { severity: "medium", detail: `${totalDupPages} pages (${percent}%) contain duplicate content blocks`, fix: "Rewrite or remove repeated text blocks - LLMs may flag this as low-quality content" }
    },
    {
      applies: dupRatio <= 0.4,
      score: 3,
      finding: { severity: "medium", detail: `${totalDupPages} pages (${percent}%) have significant duplicate content`, fix: "Widespread duplicate blocks reduce content authority - rewrite each section with unique angles" }
    },
    {
      applies: true,
      score: 0,
      finding: { severity: "high", detail: `${totalDupPages} pages (${percent}%) contain duplicate content blocks`, fix: "Severe content duplication across the site - LLMs will likely reduce citation authority" }
    }
  ];
  const { score, finding } = tiers.find((tier) => tier.applies);
  const findings = [finding];

  // Show up to two example pairs from each of the first three affected pages.
  for (const { url, pairs } of dupDetails.slice(0, 3)) {
    const shortUrl = url.slice(0, 60);
    for (const pair of pairs.slice(0, 2)) {
      findings.push({
        severity: "low",
        detail: `${shortUrl}: '${pair.headingA}' and '${pair.headingB}' share ${pair.similarity}% similar text ("${pair.sample}...")`,
        fix: `Rewrite one of these sections to eliminate duplicate content`
      });
    }
  }

  const passed = score >= 7;
  return {
    criterion: "duplicate_content",
    criterion_label: "Duplicate Content Blocks",
    score,
    status: passed ? "pass" : score >= 4 ? "partial" : "fail",
    findings,
    fix_priority: passed ? "P3" : "P1"
  };
}
2693
/**
 * Site-level criterion: paragraphs copied between different pages.
 *
 * Paragraphs are extracted from the homepage and every blog sample page.
 * A paragraph whose fingerprint (its first five shingles) occurs on at least
 * max(3, 40% of pages) pages is treated as site-wide boilerplate and is not
 * used as a comparison source. Two pages form a duplicate pair when at least
 * two source paragraphs of the earlier page have a match (shingle similarity
 * above 0.4) on the later page. The score is driven by the share of pages
 * that take part in at least one duplicate pair.
 *
 * @param {object} data - Crawl data; reads `homepage.text`,
 *   `homepage.finalUrl`, `domain` and `blogSample[].text` / `.finalUrl`.
 * @returns {{criterion: string, criterion_label: string, score: number, status: string, findings: object[], fix_priority: string}}
 *   Criterion result with a 0-10 score (5 / "partial" when fewer than two
 *   pages are available).
 */
function checkCrossPageDuplication(data) {
  const findings = [];
  const pages = [];
  if (data.homepage) {
    pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractPageParagraphs(data.homepage.text) });
  }
  if (data.blogSample) {
    for (const page of data.blogSample) {
      pages.push({ url: page.finalUrl || "", paragraphs: extractPageParagraphs(page.text) });
    }
  }
  if (pages.length <= 1) {
    findings.push({ severity: "info", detail: "Not enough pages to assess cross-page duplication" });
    return { criterion: "cross_page_duplication", criterion_label: "Cross-Page Duplicate Content", score: 5, status: "partial", findings, fix_priority: "P3" };
  }

  // Cheap paragraph identity: the first five shingles joined together.
  const fingerprintOf = (paragraph) => [...paragraph.shingles].slice(0, 5).join("|");

  // Count, per fingerprint, how many distinct pages contain it.
  const paragraphPageCount = new Map();
  for (const page of pages) {
    for (const fp of new Set(page.paragraphs.map(fingerprintOf))) {
      paragraphPageCount.set(fp, (paragraphPageCount.get(fp) || 0) + 1);
    }
  }
  const boilerplateThreshold = Math.max(3, pages.length * 0.4);
  const siteBoilerprints = new Set();
  for (const [fp, count] of paragraphPageCount) {
    if (count >= boilerplateThreshold) siteBoilerprints.add(fp);
  }

  const crossDupPairs = [];
  // Affected pages are tracked by index. Keying on the 60-character display
  // URL merged distinct pages that share a long prefix (or have no URL at
  // all), which understated the affected ratio and inflated the score.
  const affectedPages = new Set();
  for (let i = 0; i < pages.length; i++) {
    // Source paragraphs of page i, minus site-wide boilerplate (loop-invariant for j).
    const sources = pages[i].paragraphs.filter((p) => !siteBoilerprints.has(fingerprintOf(p)));
    for (let j = i + 1; j < pages.length; j++) {
      let dupCount = 0;
      let sample = "";
      for (const pA of sources) {
        for (const pB of pages[j].paragraphs) {
          const sim = shingleSimilarity(pA.shingles, pB.shingles);
          if (sim > 0.4) {
            dupCount++;
            if (!sample) sample = pA.text.slice(0, 80);
            break;
          }
        }
      }
      if (dupCount >= 2) {
        crossDupPairs.push({
          urlA: pages[i].url.slice(0, 60),
          urlB: pages[j].url.slice(0, 60),
          dupCount,
          sample
        });
        affectedPages.add(i);
        affectedPages.add(j);
      }
    }
  }

  const affectedCount = affectedPages.size;
  const affectedRatio = affectedCount / pages.length;
  const totalDupParagraphs = crossDupPairs.reduce((s, p) => s + p.dupCount, 0);
  let score;
  if (crossDupPairs.length === 0) {
    score = 10;
    findings.push({ severity: "info", detail: `${pages.length} pages analyzed - no cross-page content duplication detected` });
  } else if (affectedRatio <= 0.05 && totalDupParagraphs <= 4) {
    score = 9;
    findings.push({ severity: "info", detail: `${totalDupParagraphs} shared paragraph(s) across ${affectedCount} page(s) - minor` });
  } else if (affectedRatio <= 0.1) {
    score = 7;
    findings.push({ severity: "low", detail: `${totalDupParagraphs} shared paragraphs across ${affectedCount} pages`, fix: "Rewrite shared content so each page provides a unique perspective" });
  } else if (affectedRatio <= 0.2) {
    score = 5;
    findings.push({ severity: "medium", detail: `${affectedCount} pages (${Math.round(affectedRatio * 100)}%) share duplicate paragraphs`, fix: "Significant cross-page duplication - AI engines may only index one version" });
  } else if (affectedRatio <= 0.4) {
    score = 3;
    findings.push({ severity: "medium", detail: `${affectedCount} pages (${Math.round(affectedRatio * 100)}%) contain shared content blocks`, fix: "Widespread copy-paste content across pages reduces overall site authority" });
  } else {
    score = 0;
    findings.push({ severity: "high", detail: `${affectedCount} pages (${Math.round(affectedRatio * 100)}%) share duplicate content`, fix: "Severe cross-page duplication - AI engines will likely ignore redundant pages entirely" });
  }

  // List up to three example page pairs (URLs truncated for display only).
  for (const pair of crossDupPairs.slice(0, 3)) {
    findings.push({
      severity: "low",
      detail: `${pair.dupCount} shared paragraph(s): ${pair.urlA} \u2194 ${pair.urlB} ("${pair.sample}...")`,
      fix: "Rewrite shared paragraphs so each page has unique content"
    });
  }
  return { criterion: "cross_page_duplication", criterion_label: "Cross-Page Duplicate Content", score, status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P1" };
}
2559
2787
  function auditSiteFromData(data) {
2560
2788
  const topicCoherence = checkTopicCoherence(data);
2561
2789
  const cannibalization = checkContentCannibalization(data, topicCoherence.score);
@@ -2594,7 +2822,10 @@ function auditSiteFromData(data) {
2594
2822
  checkEvidencePackaging(data),
2595
2823
  checkEntityDisambiguation(data),
2596
2824
  checkExtractionFriction(data),
2597
- checkImageContextAI(data)
2825
+ checkImageContextAI(data),
2826
+ // V3 criteria (#35-#36)
2827
+ checkDuplicateContent(data),
2828
+ checkCrossPageDuplication(data)
2598
2829
  ];
2599
2830
  }
2600
2831
 
@@ -2612,11 +2843,11 @@ var WEIGHTS = {
2612
2843
  // Information density per page
2613
2844
  direct_answer_density: 0.05,
2614
2845
  // Direct answers to queries
2615
- qa_content_format: 0.05,
2846
+ qa_content_format: 0.04,
2616
2847
  // Answer-shaped content structure
2617
- query_answer_alignment: 0.05,
2848
+ query_answer_alignment: 0.04,
2618
2849
  // Relevance to actual AI queries
2619
- faq_section: 0.04,
2850
+ faq_section: 0.03,
2620
2851
  // Structured Q&A pairs
2621
2852
  // ─── Content Organization (~30%) ──────────────────────────────────────────
2622
2853
  // HOW easily AI engines can extract and trust your content.
@@ -2664,8 +2895,13 @@ var WEIGHTS = {
2664
2895
  // Clear entity boundaries
2665
2896
  extraction_friction: 0.02,
2666
2897
  // Sentence length, voice, jargon
2667
- image_context_ai: 0.01
2898
+ image_context_ai: 0.01,
2668
2899
  // Figure/figcaption, alt text quality
2900
+ // ─── V3 Criteria ────────────────────────────────────────────────────────
2901
+ duplicate_content: 0.05,
2902
+ // Duplicate text blocks within pages
2903
+ cross_page_duplication: 0.03
2904
+ // Same paragraphs copied across pages
2669
2905
  };
2670
2906
  function calculateOverallScore(criteria) {
2671
2907
  let totalWeight = 0;
@@ -2778,7 +3014,9 @@ var PILLARS = {
2778
3014
  "fact_density",
2779
3015
  "citation_ready_writing",
2780
3016
  "answer_first_placement",
2781
- "evidence_packaging"
3017
+ "evidence_packaging",
3018
+ "duplicate_content",
3019
+ "cross_page_duplication"
2782
3020
  ],
2783
3021
  "Content Structure": [
2784
3022
  "direct_answer_density",
@@ -2843,6 +3081,8 @@ var CLIENT_NAMES = {
2843
3081
  image_context_ai: "Image Context for AI",
2844
3082
  schema_coverage: "Schema Coverage",
2845
3083
  speakable_schema: "Speakable Schema",
3084
+ duplicate_content: "Duplicate Content Blocks",
3085
+ cross_page_duplication: "Cross-Page Duplicate Content",
2846
3086
  content_cannibalization: "Content Cannibalization",
2847
3087
  llms_txt: "llms.txt File",
2848
3088
  robots_txt: "robots.txt for AI",
@@ -2860,10 +3100,12 @@ var PILLAR_WEIGHTS = {
2860
3100
  citation_ready_writing: 0.04,
2861
3101
  answer_first_placement: 0.03,
2862
3102
  evidence_packaging: 0.03,
3103
+ duplicate_content: 0.05,
3104
+ cross_page_duplication: 0.03,
2863
3105
  direct_answer_density: 0.05,
2864
- qa_content_format: 0.05,
2865
- query_answer_alignment: 0.05,
2866
- faq_section: 0.04,
3106
+ qa_content_format: 0.04,
3107
+ query_answer_alignment: 0.04,
3108
+ faq_section: 0.03,
2867
3109
  table_list_extractability: 0.03,
2868
3110
  definition_patterns: 0.02,
2869
3111
  entity_disambiguation: 0.02,
@@ -2896,6 +3138,8 @@ var CRITERION_EFFORT = {
2896
3138
  citation_ready_writing: "Medium",
2897
3139
  answer_first_placement: "Medium",
2898
3140
  evidence_packaging: "Medium",
3141
+ duplicate_content: "Medium",
3142
+ cross_page_duplication: "Medium",
2899
3143
  direct_answer_density: "Medium",
2900
3144
  qa_content_format: "Medium",
2901
3145
  query_answer_alignment: "Medium",
@@ -2951,6 +3195,8 @@ var FIX_DESCRIPTIONS = {
2951
3195
  image_context_ai: "Wrap images in <figure>/<figcaption> with descriptive alt text.",
2952
3196
  schema_coverage: "Extend structured data to inner pages (articles, services, products).",
2953
3197
  speakable_schema: "Add SpeakableSpecification schema for voice assistant compatibility.",
3198
+ duplicate_content: "Rewrite duplicate text blocks so each section provides unique value.",
3199
+ cross_page_duplication: "Rewrite shared paragraphs across pages so each page has unique content.",
2954
3200
  content_cannibalization: "Consolidate overlapping pages or differentiate titles and H1 headings.",
2955
3201
  llms_txt: "Create a /llms.txt file describing your site for AI engines.",
2956
3202
  robots_txt: "Update robots.txt to explicitly allow AI crawlers.",
@@ -3046,7 +3292,9 @@ var CRITERION_LABELS = {
3046
3292
  "Evidence Packaging": "Evidence Packaging",
3047
3293
  "Entity Disambiguation": "Entity Disambiguation",
3048
3294
  "Extraction Friction Score": "Extraction Friction Score",
3049
- "Image Context for AI": "Image Context for AI"
3295
+ "Image Context for AI": "Image Context for AI",
3296
+ "Duplicate Content Blocks": "Duplicate Content Blocks",
3297
+ "Cross-Page Duplicate Content": "Cross-Page Duplicate Content"
3050
3298
  };
3051
3299
  function scoreToStatus(score) {
3052
3300
  if (score === 0) return "MISSING";
@@ -3141,9 +3389,9 @@ var CRITERION_WEIGHTS = {
3141
3389
  content_depth: 0.07,
3142
3390
  fact_density: 0.06,
3143
3391
  direct_answer_density: 0.05,
3144
- qa_content_format: 0.05,
3145
- query_answer_alignment: 0.05,
3146
- faq_section: 0.04,
3392
+ qa_content_format: 0.04,
3393
+ query_answer_alignment: 0.04,
3394
+ faq_section: 0.03,
3147
3395
  // Content Organization (~30%)
3148
3396
  entity_consistency: 0.05,
3149
3397
  internal_linking: 0.04,
@@ -3172,7 +3420,10 @@ var CRITERION_WEIGHTS = {
3172
3420
  evidence_packaging: 0.03,
3173
3421
  entity_disambiguation: 0.02,
3174
3422
  extraction_friction: 0.02,
3175
- image_context_ai: 0.01
3423
+ image_context_ai: 0.01,
3424
+ // V3 Criteria
3425
+ duplicate_content: 0.05,
3426
+ cross_page_duplication: 0.03
3176
3427
  };
3177
3428
  var OPPORTUNITY_TEMPLATES = {
3178
3429
  llms_txt: {
@@ -3295,6 +3546,16 @@ var OPPORTUNITY_TEMPLATES = {
3295
3546
  effort: "Medium",
3296
3547
  description: "Ensure every question-format heading (H2/H3) is followed by a direct answer paragraph. This pattern is ideal for AI engine snippet extraction."
3297
3548
  },
3549
+ duplicate_content: {
3550
+ name: "Fix Duplicate Content Blocks",
3551
+ effort: "Medium",
3552
+ description: "Sections within pages contain identical or near-identical text. LLMs may flag this as low-quality or thin content, reducing citation authority. Rewrite duplicate blocks with unique angles."
3553
+ },
3554
+ cross_page_duplication: {
3555
+ name: "Eliminate Cross-Page Duplicate Content",
3556
+ effort: "Medium",
3557
+ description: "The same paragraphs appear on multiple pages. AI engines may only index one version and ignore the rest. Rewrite shared content so each page offers a unique perspective."
3558
+ },
3298
3559
  content_cannibalization: {
3299
3560
  name: "Resolve Content Cannibalization",
3300
3561
  effort: "Medium",
@@ -3703,9 +3964,9 @@ var PAGE_CRITERIA = {
3703
3964
  original_data: { weight: 0.1, label: "Original Data & Expert Content" },
3704
3965
  fact_density: { weight: 0.06, label: "Fact & Data Density" },
3705
3966
  direct_answer_density: { weight: 0.05, label: "Direct Answer Paragraphs" },
3706
- qa_content_format: { weight: 0.05, label: "Q&A Content Format" },
3707
- query_answer_alignment: { weight: 0.05, label: "Query-Answer Alignment" },
3708
- faq_section: { weight: 0.04, label: "FAQ Section Content" },
3967
+ qa_content_format: { weight: 0.04, label: "Q&A Content Format" },
3968
+ query_answer_alignment: { weight: 0.04, label: "Query-Answer Alignment" },
3969
+ faq_section: { weight: 0.03, label: "FAQ Section Content" },
3709
3970
  // Content Organization
3710
3971
  content_freshness: { weight: 0.04, label: "Content Freshness Signals" },
3711
3972
  schema_markup: { weight: 0.03, label: "Schema.org Structured Data" },
@@ -3722,7 +3983,8 @@ var PAGE_CRITERIA = {
3722
3983
  evidence_packaging: { weight: 0.03, label: "Evidence Packaging" },
3723
3984
  entity_disambiguation: { weight: 0.02, label: "Entity Disambiguation" },
3724
3985
  extraction_friction: { weight: 0.02, label: "Extraction Friction Score" },
3725
- image_context_ai: { weight: 0.01, label: "Image Context for AI" }
3986
+ image_context_ai: { weight: 0.01, label: "Image Context for AI" },
3987
+ duplicate_content: { weight: 0.05, label: "Duplicate Content Blocks" }
3726
3988
  };
3727
3989
  function extractJsonLdBlocks(html) {
3728
3990
  const blocks = [];
@@ -4171,6 +4433,90 @@ function scoreImageContextAI(html) {
4171
4433
  if (contextualImages.length > 0) score += 3;
4172
4434
  return cap(score, 10);
4173
4435
  }
4436
// Call-to-action and legal phrases that mark a short paragraph as boilerplate.
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;

/**
 * Report whether a paragraph looks like boilerplate instead of content.
 *
 * True for paragraphs under 20 whitespace-separated words that contain a
 * BOILERPLATE_PATTERNS phrase, and for paragraphs under 30 words that mention
 * cookie / gdpr / consent / opt-out wording.
 *
 * @param {string} text - Paragraph text.
 * @returns {boolean} True when the paragraph should be skipped.
 */
function isBoilerplate(text) {
  const wordCount = text.split(/\s+/).length;
  if (wordCount >= 30) return false;
  if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text)) return true;
  return wordCount < 20 && BOILERPLATE_PATTERNS.test(text);
}
4443
/**
 * Page-level duplicate-content score, without the duplicate details.
 *
 * @param {string} html - Raw page HTML.
 * @returns {number} The `score` reported by scoreDuplicateContentDetailed.
 */
function scoreDuplicateContent(html) {
  const { score } = scoreDuplicateContentDetailed(html);
  return score;
}
4446
/**
 * Score a single page for duplicated text between its sections and return
 * the matching paragraph pairs.
 *
 * For every pair of sections, each paragraph of the earlier section counts as
 * one duplicate when any paragraph of the later section has a shingle
 * similarity above 0.4. Scores: no duplicates 10; one duplicate 6 when it is
 * at most 5% of all paragraphs, otherwise 4; two duplicates 2; more 0.
 *
 * @param {string} html - Raw page HTML.
 * @returns {{score: number, duplicates: Array<{headingA: string, headingB: string, similarity: number, sample: string}>}}
 *   Score plus one entry per duplicate found.
 */
function scoreDuplicateContentDetailed(html) {
  const sections = extractSectionsWithParagraphs(html);
  if (sections.length < 2) return { score: 10, duplicates: [] };

  const duplicates = [];
  for (const [index, first] of sections.entries()) {
    for (const second of sections.slice(index + 1)) {
      for (const candidate of first.paragraphs) {
        for (const other of second.paragraphs) {
          const sim = shingleJaccard(candidate.shingles, other.shingles);
          if (!(sim > 0.4)) continue;
          duplicates.push({
            headingA: first.heading,
            headingB: second.heading,
            similarity: Math.round(sim * 100),
            sample: candidate.text.slice(0, 80)
          });
          // One hit per candidate paragraph and section pair is enough.
          break;
        }
      }
    }
  }

  const dupParagraphCount = duplicates.length;
  if (dupParagraphCount === 0) return { score: 10, duplicates: [] };

  let totalParagraphs = 0;
  for (const section of sections) totalParagraphs += section.paragraphs.length;
  const dupRatio = totalParagraphs > 0 ? dupParagraphCount / totalParagraphs : 0;

  const score = dupParagraphCount === 1
    ? (dupRatio <= 0.05 ? 6 : 4)
    : dupParagraphCount === 2 ? 2 : 0;
  return { score, duplicates };
}
4485
/**
 * Split a page into h2/h3-delimited sections and collect each section's
 * comparable paragraphs.
 *
 * script/style/nav/header/footer/noscript/aside elements are removed first.
 * Content before the first h2/h3 is reported under the heading "(intro)".
 * Paragraph text is lower-cased with tags and character references replaced by
 * spaces. Paragraphs with fewer than three 4-word shingles, or flagged by
 * isBoilerplate, are dropped, and sections left without paragraphs are
 * omitted.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in document order; empty when `html` is not a non-empty string.
 */
function extractSectionsWithParagraphs(html) {
  // A page whose body is missing has no sections; avoid throwing on .replace.
  if (typeof html !== "string" || html === "") return [];
  const cleaned = html
    .replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "")
    .replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
  // The lookahead keeps each heading tag at the start of its own part.
  const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
  const sections = [];
  for (const part of parts) {
    const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
    const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
    const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
    const paragraphs = pMatches
      .map((p) => {
        const text = p
          .replace(/<[^>]*>/g, " ")
          // Strip named (&amp;), decimal (&#39;) and hex (&#x27;) character references.
          // The previous /&\w+;/ pattern missed numeric ones because "#" is not a word character.
          .replace(/&(?:#\d+|#x[\da-f]+|\w+);/gi, " ")
          .replace(/\s+/g, " ")
          .trim()
          .toLowerCase();
        return { text, shingles: buildShingles(text, 4) };
      })
      .filter((p) => p.shingles.size >= 3 && !isBoilerplate(p.text));
    if (paragraphs.length > 0) {
      sections.push({ heading, paragraphs });
    }
  }
  return sections;
}
4503
/**
 * Build the set of overlapping n-word shingles for a piece of text.
 *
 * The text is split on whitespace and tokens of one character or less are
 * ignored before the windows are taken.
 *
 * @param {string} text - Text to shingle.
 * @param {number} n - Number of words per shingle.
 * @returns {Set<string>} Distinct shingles, words joined by single spaces;
 *   empty when fewer than `n` tokens remain.
 */
function buildShingles(text, n) {
  const tokens = text.split(/\s+/).filter((token) => token.length > 1);
  const result = new Set();
  let start = 0;
  while (start <= tokens.length - n) {
    result.add(tokens.slice(start, start + n).join(" "));
    start += 1;
  }
  return result;
}
4511
/**
 * Jaccard index of two shingle sets: shared shingles divided by the size of
 * the union.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} Value between 0 and 1; 0 when both sets are empty.
 */
function shingleJaccard(a, b) {
  if (a.size === 0 && b.size === 0) return 0;
  const overlap = Array.from(a).reduce((count, shingle) => (b.has(shingle) ? count + 1 : count), 0);
  const unionSize = a.size + b.size - overlap;
  if (unionSize === 0) return 0;
  return overlap / unionSize;
}
4174
4520
  var SCORING_FUNCTIONS = {
4175
4521
  schema_markup: scoreSchemaMarkup,
4176
4522
  qa_content_format: scoreQAFormat,
@@ -4191,7 +4537,8 @@ var SCORING_FUNCTIONS = {
4191
4537
  evidence_packaging: scoreEvidencePackaging,
4192
4538
  entity_disambiguation: scoreEntityDisambiguation,
4193
4539
  extraction_friction: scoreExtractionFriction,
4194
- image_context_ai: scoreImageContextAI
4540
+ image_context_ai: scoreImageContextAI,
4541
+ duplicate_content: scoreDuplicateContent
4195
4542
  };
4196
4543
  function scorePage(html, url) {
4197
4544
  let totalWeight = 0;
@@ -4205,6 +4552,11 @@ function scorePage(html, url) {
4205
4552
  totalWeight += weight;
4206
4553
  }
4207
4554
  let aeoScore = totalWeight === 0 ? 0 : Math.round(weightedSum / totalWeight);
4555
+ const dupScore = criterionScores.find((c) => c.criterion === "duplicate_content")?.score ?? 10;
4556
+ if (dupScore <= 6) {
4557
+ const dupCap = 35 + dupScore * 5;
4558
+ aeoScore = Math.min(aeoScore, dupCap);
4559
+ }
4208
4560
  const scoreCapped = aeoScore > 75;
4209
4561
  if (scoreCapped) aeoScore = 75;
4210
4562
  return { aeoScore, criterionScores, scoreCapped };
@@ -4410,6 +4762,15 @@ function checkHasCitationReadyContent(html) {
4410
4762
  }
4411
4763
  return null;
4412
4764
  }
4765
/**
 * Page issue check: report duplicated text blocks between sections.
 *
 * @param {string} html - Raw page HTML.
 * @returns {{check: string, label: string, severity: string} | null}
 *   An issue when the duplicate-content score is 6 or lower and at least one
 *   duplicate was found ("error" for scores of 3 or lower, otherwise
 *   "warning"); null when there is nothing to report.
 */
function checkDuplicateContentBlocks(html) {
  const { score, duplicates } = scoreDuplicateContentDetailed(html);
  const hasIssue = score <= 6 && duplicates.length > 0;
  if (!hasIssue) return null;

  const [first] = duplicates;
  let label;
  if (duplicates.length === 1) {
    label = `Duplicate content: '${first.headingA}' and '${first.headingB}' share ${first.similarity}% similar text ("${first.sample}...")`;
  } else {
    label = `${duplicates.length} duplicate blocks found (e.g. '${first.headingA}' and '${first.headingB}' \u2014 "${first.sample}...")`;
  }
  const severity = score <= 3 ? "error" : "warning";
  return { check: "duplicate-content", label, severity };
}
4413
4774
  function analyzePage(html, url, category) {
4414
4775
  const title = extractTitle(html);
4415
4776
  const textContent = getTextContent2(html);
@@ -4428,7 +4789,8 @@ function analyzePage(html, url, category) {
4428
4789
  checkImagesMissingAlt(html),
4429
4790
  checkNoInternalLinks(html, url),
4430
4791
  checkNoAnswerBlock(html),
4431
- checkNoEvidence(html, url)
4792
+ checkNoEvidence(html, url),
4793
+ checkDuplicateContentBlocks(html)
4432
4794
  ];
4433
4795
  for (const result of issueChecks) {
4434
4796
  if (result) issues.push(result);
@@ -5060,6 +5422,10 @@ function printSummary(result) {
5060
5422
  const issueLabel = issueCount === 0 ? "0 issues" : issueCount === 1 ? "1 issue" : `${issueCount} issues`;
5061
5423
  const aeoLabel = page.aeoScore != null ? ` [AEO: ${page.aeoScore}]` : "";
5062
5424
  log(` ${cat.padEnd(10)} ${page.url.padEnd(50)} ${issueLabel}${aeoLabel}`);
5425
+ const dupIssue = page.issues.find((i) => i.check === "duplicate-content");
5426
+ if (dupIssue) {
5427
+ log(` \u26A0 ${dupIssue.label}`);
5428
+ }
5063
5429
  }
5064
5430
  const scored = result.pagesReviewed.filter((p) => p.aeoScore != null);
5065
5431
  if (scored.length > 0) {