aeorank 3.1.0 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -183,7 +183,7 @@ Use the built-in action to gate deployments on AEO score:
183
183
 
184
184
  ```yaml
185
185
  - name: AEO Audit
186
- uses: AEO-Content-Inc/aeorank@v2
186
+ uses: AEO-Content-Inc/aeorank@v3
187
187
  with:
188
188
  domain: example.com
189
189
  threshold: 70
@@ -574,6 +574,10 @@ console.log(result.comparison.tied); // Criteria with equal scores
574
574
 
575
575
  ## Changelog
576
576
 
577
+ ### v3.1.1 - Duplicate Detection False-Positive Fix
578
+
579
+ Duplicate-content detection now ignores short metadata rows like `Deadline:` and `Decision timeline:` so structured guides do not get penalized for repeated timeline labels. Shared duplicate-matching logic is now used by both page scoring and site-wide crawling.
580
+
577
581
  ### v3.1.0 - Duplicate Content Detection
578
582
 
579
583
  2 new criteria (#35-#36): Duplicate Content Blocks (intra-page, 5%) and Cross-Page Duplicate Content (3%). Detects identical text blocks within pages and copy-pasted paragraphs across pages using shingle-based Jaccard similarity. Boilerplate filtering excludes CTAs, signups, and template content from false positives. Duplication gate caps per-page scores when severe duplication is found. CLI now shows duplicate section names inline per page.
package/dist/browser.js CHANGED
@@ -78,6 +78,75 @@ function detectParkedDomain(bodySnippet) {
78
78
  return { isParked: false };
79
79
  }
80
80
 
81
+ // src/duplicate-content.ts
82
// Call-to-action and legal phrases. isBoilerplateParagraph treats a paragraph
// of fewer than 20 words that contains any of them as boilerplate.
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;

// createParagraph discards paragraphs with fewer tokens than this.
var MIN_SUBSTANTIVE_WORDS = 15;

// Upper bound on total tokens for a "Label: value" paragraph to count as metadata.
var MAX_METADATA_WORDS = 24;

// Upper bound on tokens in the label (the text before the first colon) of a metadata paragraph.
var MAX_METADATA_LABEL_WORDS = 4;
86
/**
 * Reduces an HTML fragment to lowercase plain text for comparison.
 *
 * Tags and character references are each replaced by a single space, runs of
 * whitespace are collapsed to one space, and the result is trimmed and
 * lowercased.
 *
 * @param {string} htmlFragment - Raw HTML, e.g. a whole `<p>...</p>` element.
 * @returns {string} Normalized lowercase text with no markup.
 */
function normalizeParagraphText(htmlFragment) {
  // Covers named (&amp;), decimal (&#8217;) and hex (&#x27;) references. A bare
  // `&\w+;` misses the numeric forms because `#` is not a word character, which
  // leaves them embedded in the text and glues the surrounding words together.
  const characterReference = /&(?:#\d+|#x[0-9a-f]+|\w+);/gi;
  return htmlFragment
    .replace(/<[^>]*>/g, " ")
    .replace(characterReference, " ")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
}
89
/**
 * Splits text into word tokens used for word counts and shingles.
 *
 * The text is split on whitespace. Each piece then has any leading and
 * trailing characters that are not letters, combining marks, or digits
 * removed, and pieces left empty are discarded. Characters in the middle of a
 * piece (such as the apostrophe in "don't") are kept.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in their original order.
 */
function tokenize(text) {
  // Unicode property classes rather than [a-z0-9]: an ASCII-only class trims
  // accented letters off word edges ("café" -> "caf") and erases non-Latin
  // words completely, which would make such paragraphs count as zero words.
  const edgeNoise = /^[^\p{L}\p{M}\p{N}]+|[^\p{L}\p{M}\p{N}]+$/gu;
  return text
    .split(/\s+/)
    .map((word) => word.replace(edgeNoise, ""))
    .filter((word) => word.length > 0);
}
92
/**
 * Decides whether a paragraph is template/boilerplate copy.
 *
 * A paragraph qualifies when it has fewer than 20 words and matches
 * BOILERPLATE_PATTERNS, or when it has fewer than 30 words and mentions
 * cookie, gdpr, consent or an "opt?out" variant.
 *
 * @param {string} text - Paragraph text to test.
 * @param {number} words - Word count of the paragraph, supplied by the caller.
 * @returns {boolean} True when the paragraph is considered boilerplate.
 */
function isBoilerplateParagraph(text, words) {
  const isShortCallToAction = words < 20 && BOILERPLATE_PATTERNS.test(text);
  if (isShortCallToAction) {
    return true;
  }
  const mentionsConsent = /\b(cookie|gdpr|consent|opt.out)\b/i.test(text);
  return mentionsConsent && words < 30;
}
97
/**
 * Decides whether a paragraph is a short "Label: value" metadata row.
 *
 * The text must start with a label of 1-60 non-colon characters followed by a
 * colon and whitespace. The label must tokenize to between 1 and
 * MAX_METADATA_LABEL_WORDS tokens, and the whole paragraph must have at most
 * MAX_METADATA_WORDS words.
 *
 * @param {string} text - Paragraph text to test.
 * @param {number} words - Word count of the whole paragraph, supplied by the caller.
 * @returns {boolean} True when the paragraph looks like a metadata row.
 */
function isMetadataParagraph(text, words) {
  const label = text.match(/^([^:]{1,60}):\s+/);
  if (label === null) {
    return false;
  }
  const labelWordCount = tokenize(label[1]).length;
  const hasShortLabel = labelWordCount > 0 && labelWordCount <= MAX_METADATA_LABEL_WORDS;
  return hasShortLabel && words <= MAX_METADATA_WORDS;
}
103
/**
 * Builds the set of n-word shingles (sliding windows) over a token list.
 *
 * Each shingle is n consecutive tokens joined by a single space. When there
 * are fewer than n tokens, the set is empty.
 *
 * @param {string[]} words - Tokens in reading order.
 * @param {number} [n=4] - Number of tokens per shingle.
 * @returns {Set<string>} Distinct shingles.
 */
function buildShinglesFromTokens(words, n = 4) {
  const lastStart = words.length - n;
  const shingles = new Set();
  let start = 0;
  while (start <= lastStart) {
    const window = words.slice(start, start + n);
    shingles.add(window.join(" "));
    start++;
  }
  return shingles;
}
110
/**
 * Turns an HTML fragment into a comparable paragraph record, or rejects it.
 *
 * Returns null when the normalized text has fewer than MIN_SUBSTANTIVE_WORDS
 * tokens, is classified as boilerplate or as a metadata row, or yields fewer
 * than 3 distinct shingles.
 *
 * @param {string} htmlFragment - Raw HTML for one paragraph.
 * @returns {{ text: string, shingles: Set<string> } | null} The paragraph record, or null if filtered out.
 */
function createParagraph(htmlFragment) {
  const text = normalizeParagraphText(htmlFragment);
  const tokens = tokenize(text);
  const wordCount = tokens.length;

  // Checks run cheapest-first and stop at the first one that rejects the paragraph.
  const isRejected =
    wordCount < MIN_SUBSTANTIVE_WORDS ||
    isBoilerplateParagraph(text, wordCount) ||
    isMetadataParagraph(text, wordCount);
  if (isRejected) {
    return null;
  }

  const shingles = buildShinglesFromTokens(tokens);
  return shingles.size < 3 ? null : { text, shingles };
}
120
/**
 * Removes elements that are not main content from an HTML string.
 *
 * The first pass deletes script, style, nav, header, footer and noscript
 * elements together with their contents; the second pass does the same for
 * aside elements. Matching is case-insensitive and non-greedy.
 *
 * @param {string} html - HTML source.
 * @returns {string} The HTML with those elements removed.
 */
function stripNonContentHtml(html) {
  const chromeElements = /<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi;
  const asideElements = /<aside\b[^>]*>[\s\S]*?<\/aside>/gi;
  const withoutChrome = html.replace(chromeElements, "");
  return withoutChrome.replace(asideElements, "");
}
123
/**
 * Collects the comparable paragraphs of a page.
 *
 * Non-content elements are stripped first, then every `<p>` element is passed
 * through createParagraph; fragments it rejects (null) are left out.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{ text: string, shingles: Set<string> }>} Accepted paragraphs in document order.
 */
function extractDuplicateContentParagraphs(html) {
  const fragments = stripNonContentHtml(html).match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
  const paragraphs = [];
  for (const fragment of fragments) {
    const paragraph = createParagraph(fragment);
    if (paragraph !== null) {
      paragraphs.push(paragraph);
    }
  }
  return paragraphs;
}
128
/**
 * Splits a page into h2/h3-delimited sections with their comparable paragraphs.
 *
 * Non-content elements are stripped, then the HTML is cut immediately before
 * every `<h2>`/`<h3>` opening tag. Each piece is labelled with its heading
 * text (tags removed, trimmed), or "(intro)" when it contains no heading.
 * Pieces in which no paragraph survives createParagraph are omitted.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{ heading: string, paragraphs: Array<{ text: string, shingles: Set<string> }> }>}
 *   Sections in document order.
 */
function extractDuplicateContentSections(html) {
  return stripNonContentHtml(html)
    .split(/(?=<h[23]\b[^>]*>)/i)
    .map((chunk) => {
      const headingMatch = chunk.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
      const heading = headingMatch
        ? headingMatch[1].replace(/<[^>]*>/g, "").trim()
        : "(intro)";
      const fragments = chunk.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
      const paragraphs = fragments
        .map((fragment) => createParagraph(fragment))
        .filter((paragraph) => paragraph !== null);
      return { heading, paragraphs };
    })
    .filter((section) => section.paragraphs.length > 0);
}
140
/**
 * Computes the Jaccard similarity of two shingle sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} A value from 0 to 1; 0 when both sets are empty.
 */
function shingleJaccardSimilarity(a, b) {
  if (a.size === 0 && b.size === 0) {
    return 0;
  }
  let shared = 0;
  for (const shingle of a) {
    shared += b.has(shingle) ? 1 : 0;
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = a.size + b.size - shared;
  if (unionSize === 0) {
    return 0;
  }
  return shared / unionSize;
}
149
+
81
150
  // src/site-crawler.ts
82
151
  async function fetchText(url) {
83
152
  try {
@@ -2558,56 +2627,8 @@ function checkImageContextAI(data) {
2558
2627
  }
2559
2628
  return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
2560
2629
  }
2561
- var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
2562
- function isBoilerplateParagraph(text) {
2563
- const words = text.split(/\s+/).length;
2564
- if (words < 20 && BOILERPLATE_RE.test(text)) return true;
2565
- if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
2566
- return false;
2567
- }
2568
- function toShingles(text, n = 4) {
2569
- const words = text.split(/\s+/).filter((w) => w.length > 1);
2570
- const shingles = /* @__PURE__ */ new Set();
2571
- for (let i = 0; i <= words.length - n; i++) {
2572
- shingles.add(words.slice(i, i + n).join(" "));
2573
- }
2574
- return shingles;
2575
- }
2576
- function shingleSimilarity(a, b) {
2577
- if (a.size === 0 && b.size === 0) return 0;
2578
- let intersection = 0;
2579
- for (const s of a) {
2580
- if (b.has(s)) intersection++;
2581
- }
2582
- const union = a.size + b.size - intersection;
2583
- return union === 0 ? 0 : intersection / union;
2584
- }
2585
- function extractPageParagraphs(html) {
2586
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
2587
- const pMatches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
2588
- return pMatches.map((p) => {
2589
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
2590
- return { text, shingles: toShingles(text) };
2591
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
2592
- }
2593
- function splitIntoSectionsWithParagraphs(html) {
2594
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
2595
- const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
2596
- const sections = [];
2597
- for (const part of parts) {
2598
- const hMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
2599
- const heading = hMatch ? hMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
2600
- const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
2601
- const paragraphs = pMatches.map((p) => {
2602
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
2603
- return { text, shingles: toShingles(text) };
2604
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
2605
- if (paragraphs.length > 0) sections.push({ heading, paragraphs });
2606
- }
2607
- return sections;
2608
- }
2609
2630
  function findIntraPageDuplicates(html) {
2610
- const sections = splitIntoSectionsWithParagraphs(html);
2631
+ const sections = extractDuplicateContentSections(html);
2611
2632
  if (sections.length < 2) return [];
2612
2633
  const pairs = [];
2613
2634
  for (let i = 0; i < sections.length; i++) {
@@ -2616,7 +2637,7 @@ function findIntraPageDuplicates(html) {
2616
2637
  for (const pA of sections[i].paragraphs) {
2617
2638
  if (found) break;
2618
2639
  for (const pB of sections[j].paragraphs) {
2619
- const sim = shingleSimilarity(pA.shingles, pB.shingles);
2640
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
2620
2641
  if (sim > 0.4) {
2621
2642
  pairs.push({
2622
2643
  headingA: sections[i].heading,
@@ -2696,11 +2717,11 @@ function checkCrossPageDuplication(data) {
2696
2717
  const findings = [];
2697
2718
  const pages = [];
2698
2719
  if (data.homepage) {
2699
- pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractPageParagraphs(data.homepage.text) });
2720
+ pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
2700
2721
  }
2701
2722
  if (data.blogSample) {
2702
2723
  for (const page of data.blogSample) {
2703
- pages.push({ url: page.finalUrl || "", paragraphs: extractPageParagraphs(page.text) });
2724
+ pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
2704
2725
  }
2705
2726
  }
2706
2727
  if (pages.length <= 1) {
@@ -2732,7 +2753,7 @@ function checkCrossPageDuplication(data) {
2732
2753
  const fpA = [...pA.shingles].slice(0, 5).join("|");
2733
2754
  if (siteBoilerprints.has(fpA)) continue;
2734
2755
  for (const pB of pages[j].paragraphs) {
2735
- const sim = shingleSimilarity(pA.shingles, pB.shingles);
2756
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
2736
2757
  if (sim > 0.4) {
2737
2758
  dupCount++;
2738
2759
  if (!sample) sample = pA.text.slice(0, 80);
@@ -4357,18 +4378,11 @@ function scoreImageContextAI(html) {
4357
4378
  if (contextualImages.length > 0) score += 3;
4358
4379
  return cap(score, 10);
4359
4380
  }
4360
- var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
4361
- function isBoilerplate(text) {
4362
- const words = text.split(/\s+/).length;
4363
- if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
4364
- if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
4365
- return false;
4366
- }
4367
4381
  function scoreDuplicateContent(html) {
4368
4382
  return scoreDuplicateContentDetailed(html).score;
4369
4383
  }
4370
4384
  function scoreDuplicateContentDetailed(html) {
4371
- const sections = extractSectionsWithParagraphs(html);
4385
+ const sections = extractDuplicateContentSections(html);
4372
4386
  if (sections.length < 2) return { score: 10, duplicates: [] };
4373
4387
  const totalParagraphs = sections.reduce((sum, s) => sum + s.paragraphs.length, 0);
4374
4388
  const duplicates = [];
@@ -4377,7 +4391,7 @@ function scoreDuplicateContentDetailed(html) {
4377
4391
  for (let j = i + 1; j < sections.length; j++) {
4378
4392
  for (const pA of sections[i].paragraphs) {
4379
4393
  for (const pB of sections[j].paragraphs) {
4380
- const sim = shingleJaccard(pA.shingles, pB.shingles);
4394
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
4381
4395
  if (sim > 0.4) {
4382
4396
  dupParagraphCount++;
4383
4397
  duplicates.push({
@@ -4406,41 +4420,6 @@ function scoreDuplicateContentDetailed(html) {
4406
4420
  }
4407
4421
  return { score, duplicates };
4408
4422
  }
4409
- function extractSectionsWithParagraphs(html) {
4410
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
4411
- const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
4412
- const sections = [];
4413
- for (const part of parts) {
4414
- const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
4415
- const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
4416
- const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
4417
- const paragraphs = pMatches.map((p) => {
4418
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
4419
- return { text, shingles: buildShingles(text, 4) };
4420
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplate(p.text));
4421
- if (paragraphs.length > 0) {
4422
- sections.push({ heading, paragraphs });
4423
- }
4424
- }
4425
- return sections;
4426
- }
4427
- function buildShingles(text, n) {
4428
- const words = text.split(/\s+/).filter((w) => w.length > 1);
4429
- const shingles = /* @__PURE__ */ new Set();
4430
- for (let i = 0; i <= words.length - n; i++) {
4431
- shingles.add(words.slice(i, i + n).join(" "));
4432
- }
4433
- return shingles;
4434
- }
4435
- function shingleJaccard(a, b) {
4436
- if (a.size === 0 && b.size === 0) return 0;
4437
- let intersection = 0;
4438
- for (const s of a) {
4439
- if (b.has(s)) intersection++;
4440
- }
4441
- const union = a.size + b.size - intersection;
4442
- return union === 0 ? 0 : intersection / union;
4443
- }
4444
4423
  var SCORING_FUNCTIONS = {
4445
4424
  schema_markup: scoreSchemaMarkup,
4446
4425
  qa_content_format: scoreQAFormat,