aeorank 3.1.0 → 3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/browser.js +76 -97
- package/dist/browser.js.map +1 -1
- package/dist/cli.js +76 -97
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +76 -97
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +76 -97
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -183,7 +183,7 @@ Use the built-in action to gate deployments on AEO score:
|
|
|
183
183
|
|
|
184
184
|
```yaml
|
|
185
185
|
- name: AEO Audit
|
|
186
|
-
uses: AEO-Content-Inc/aeorank@
|
|
186
|
+
uses: AEO-Content-Inc/aeorank@v3
|
|
187
187
|
with:
|
|
188
188
|
domain: example.com
|
|
189
189
|
threshold: 70
|
|
@@ -574,6 +574,10 @@ console.log(result.comparison.tied); // Criteria with equal scores
|
|
|
574
574
|
|
|
575
575
|
## Changelog
|
|
576
576
|
|
|
577
|
+
### v3.1.1 - Duplicate Detection False-Positive Fix
|
|
578
|
+
|
|
579
|
+
Duplicate-content detection now ignores short metadata rows like `Deadline:` and `Decision timeline:` so structured guides do not get penalized for repeated timeline labels. Shared duplicate-matching logic is now used by both page scoring and site-wide crawling.
|
|
580
|
+
|
|
577
581
|
### v3.1.0 - Duplicate Content Detection
|
|
578
582
|
|
|
579
583
|
2 new criteria (#35-#36): Duplicate Content Blocks (intra-page, 5%) and Cross-Page Duplicate Content (3%). Detects identical text blocks within pages and copy-pasted paragraphs across pages using shingle-based Jaccard similarity. Boilerplate filtering excludes CTAs, signups, and template content from false positives. Duplication gate caps per-page scores when severe duplication is found. CLI now shows duplicate section names inline per page.
|
package/dist/browser.js
CHANGED
|
@@ -78,6 +78,75 @@ function detectParkedDomain(bodySnippet) {
|
|
|
78
78
|
return { isParked: false };
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
+
// src/duplicate-content.ts
|
|
82
|
+
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
83
|
+
var MIN_SUBSTANTIVE_WORDS = 15;
|
|
84
|
+
var MAX_METADATA_WORDS = 24;
|
|
85
|
+
var MAX_METADATA_LABEL_WORDS = 4;
|
|
86
|
+
function normalizeParagraphText(htmlFragment) {
|
|
87
|
+
return htmlFragment.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
88
|
+
}
|
|
89
|
+
function tokenize(text) {
|
|
90
|
+
return text.split(/\s+/).map((word) => word.replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, "")).filter((word) => word.length > 0);
|
|
91
|
+
}
|
|
92
|
+
function isBoilerplateParagraph(text, words) {
|
|
93
|
+
if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
|
|
94
|
+
if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
|
|
95
|
+
return false;
|
|
96
|
+
}
|
|
97
|
+
function isMetadataParagraph(text, words) {
|
|
98
|
+
const labelMatch = text.match(/^([^:]{1,60}):\s+/);
|
|
99
|
+
if (!labelMatch) return false;
|
|
100
|
+
const labelWords = tokenize(labelMatch[1]).length;
|
|
101
|
+
return labelWords > 0 && labelWords <= MAX_METADATA_LABEL_WORDS && words <= MAX_METADATA_WORDS;
|
|
102
|
+
}
|
|
103
|
+
function buildShinglesFromTokens(words, n = 4) {
|
|
104
|
+
const shingles = /* @__PURE__ */ new Set();
|
|
105
|
+
for (let i = 0; i <= words.length - n; i++) {
|
|
106
|
+
shingles.add(words.slice(i, i + n).join(" "));
|
|
107
|
+
}
|
|
108
|
+
return shingles;
|
|
109
|
+
}
|
|
110
|
+
function createParagraph(htmlFragment) {
|
|
111
|
+
const text = normalizeParagraphText(htmlFragment);
|
|
112
|
+
const words = tokenize(text);
|
|
113
|
+
if (words.length < MIN_SUBSTANTIVE_WORDS) return null;
|
|
114
|
+
if (isBoilerplateParagraph(text, words.length)) return null;
|
|
115
|
+
if (isMetadataParagraph(text, words.length)) return null;
|
|
116
|
+
const shingles = buildShinglesFromTokens(words);
|
|
117
|
+
if (shingles.size < 3) return null;
|
|
118
|
+
return { text, shingles };
|
|
119
|
+
}
|
|
120
|
+
function stripNonContentHtml(html) {
|
|
121
|
+
return html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
122
|
+
}
|
|
123
|
+
function extractDuplicateContentParagraphs(html) {
|
|
124
|
+
const cleaned = stripNonContentHtml(html);
|
|
125
|
+
const matches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
126
|
+
return matches.map(createParagraph).filter((paragraph) => paragraph !== null);
|
|
127
|
+
}
|
|
128
|
+
function extractDuplicateContentSections(html) {
|
|
129
|
+
const cleaned = stripNonContentHtml(html);
|
|
130
|
+
const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
|
|
131
|
+
const sections = [];
|
|
132
|
+
for (const part of parts) {
|
|
133
|
+
const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
|
|
134
|
+
const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
|
|
135
|
+
const paragraphs = (part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []).map(createParagraph).filter((paragraph) => paragraph !== null);
|
|
136
|
+
if (paragraphs.length > 0) sections.push({ heading, paragraphs });
|
|
137
|
+
}
|
|
138
|
+
return sections;
|
|
139
|
+
}
|
|
140
|
+
function shingleJaccardSimilarity(a, b) {
|
|
141
|
+
if (a.size === 0 && b.size === 0) return 0;
|
|
142
|
+
let intersection = 0;
|
|
143
|
+
for (const shingle of a) {
|
|
144
|
+
if (b.has(shingle)) intersection++;
|
|
145
|
+
}
|
|
146
|
+
const union = a.size + b.size - intersection;
|
|
147
|
+
return union === 0 ? 0 : intersection / union;
|
|
148
|
+
}
|
|
149
|
+
|
|
81
150
|
// src/site-crawler.ts
|
|
82
151
|
async function fetchText(url) {
|
|
83
152
|
try {
|
|
@@ -2558,56 +2627,8 @@ function checkImageContextAI(data) {
|
|
|
2558
2627
|
}
|
|
2559
2628
|
return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
|
|
2560
2629
|
}
|
|
2561
|
-
var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
2562
|
-
function isBoilerplateParagraph(text) {
|
|
2563
|
-
const words = text.split(/\s+/).length;
|
|
2564
|
-
if (words < 20 && BOILERPLATE_RE.test(text)) return true;
|
|
2565
|
-
if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
|
|
2566
|
-
return false;
|
|
2567
|
-
}
|
|
2568
|
-
function toShingles(text, n = 4) {
|
|
2569
|
-
const words = text.split(/\s+/).filter((w) => w.length > 1);
|
|
2570
|
-
const shingles = /* @__PURE__ */ new Set();
|
|
2571
|
-
for (let i = 0; i <= words.length - n; i++) {
|
|
2572
|
-
shingles.add(words.slice(i, i + n).join(" "));
|
|
2573
|
-
}
|
|
2574
|
-
return shingles;
|
|
2575
|
-
}
|
|
2576
|
-
function shingleSimilarity(a, b) {
|
|
2577
|
-
if (a.size === 0 && b.size === 0) return 0;
|
|
2578
|
-
let intersection = 0;
|
|
2579
|
-
for (const s of a) {
|
|
2580
|
-
if (b.has(s)) intersection++;
|
|
2581
|
-
}
|
|
2582
|
-
const union = a.size + b.size - intersection;
|
|
2583
|
-
return union === 0 ? 0 : intersection / union;
|
|
2584
|
-
}
|
|
2585
|
-
function extractPageParagraphs(html) {
|
|
2586
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
2587
|
-
const pMatches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
2588
|
-
return pMatches.map((p) => {
|
|
2589
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
2590
|
-
return { text, shingles: toShingles(text) };
|
|
2591
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
|
|
2592
|
-
}
|
|
2593
|
-
function splitIntoSectionsWithParagraphs(html) {
|
|
2594
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
2595
|
-
const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
|
|
2596
|
-
const sections = [];
|
|
2597
|
-
for (const part of parts) {
|
|
2598
|
-
const hMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
|
|
2599
|
-
const heading = hMatch ? hMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
|
|
2600
|
-
const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
2601
|
-
const paragraphs = pMatches.map((p) => {
|
|
2602
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
2603
|
-
return { text, shingles: toShingles(text) };
|
|
2604
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
|
|
2605
|
-
if (paragraphs.length > 0) sections.push({ heading, paragraphs });
|
|
2606
|
-
}
|
|
2607
|
-
return sections;
|
|
2608
|
-
}
|
|
2609
2630
|
function findIntraPageDuplicates(html) {
|
|
2610
|
-
const sections = splitIntoSectionsWithParagraphs(html);
|
|
2631
|
+
const sections = extractDuplicateContentSections(html);
|
|
2611
2632
|
if (sections.length < 2) return [];
|
|
2612
2633
|
const pairs = [];
|
|
2613
2634
|
for (let i = 0; i < sections.length; i++) {
|
|
@@ -2616,7 +2637,7 @@ function findIntraPageDuplicates(html) {
|
|
|
2616
2637
|
for (const pA of sections[i].paragraphs) {
|
|
2617
2638
|
if (found) break;
|
|
2618
2639
|
for (const pB of sections[j].paragraphs) {
|
|
2619
|
-
const sim = shingleSimilarity(pA.shingles, pB.shingles);
|
|
2640
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
2620
2641
|
if (sim > 0.4) {
|
|
2621
2642
|
pairs.push({
|
|
2622
2643
|
headingA: sections[i].heading,
|
|
@@ -2696,11 +2717,11 @@ function checkCrossPageDuplication(data) {
|
|
|
2696
2717
|
const findings = [];
|
|
2697
2718
|
const pages = [];
|
|
2698
2719
|
if (data.homepage) {
|
|
2699
|
-
pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractPageParagraphs(data.homepage.text) });
|
|
2720
|
+
pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
|
|
2700
2721
|
}
|
|
2701
2722
|
if (data.blogSample) {
|
|
2702
2723
|
for (const page of data.blogSample) {
|
|
2703
|
-
pages.push({ url: page.finalUrl || "", paragraphs: extractPageParagraphs(page.text) });
|
|
2724
|
+
pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
|
|
2704
2725
|
}
|
|
2705
2726
|
}
|
|
2706
2727
|
if (pages.length <= 1) {
|
|
@@ -2732,7 +2753,7 @@ function checkCrossPageDuplication(data) {
|
|
|
2732
2753
|
const fpA = [...pA.shingles].slice(0, 5).join("|");
|
|
2733
2754
|
if (siteBoilerprints.has(fpA)) continue;
|
|
2734
2755
|
for (const pB of pages[j].paragraphs) {
|
|
2735
|
-
const sim = shingleSimilarity(pA.shingles, pB.shingles);
|
|
2756
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
2736
2757
|
if (sim > 0.4) {
|
|
2737
2758
|
dupCount++;
|
|
2738
2759
|
if (!sample) sample = pA.text.slice(0, 80);
|
|
@@ -4357,18 +4378,11 @@ function scoreImageContextAI(html) {
|
|
|
4357
4378
|
if (contextualImages.length > 0) score += 3;
|
|
4358
4379
|
return cap(score, 10);
|
|
4359
4380
|
}
|
|
4360
|
-
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
4361
|
-
function isBoilerplate(text) {
|
|
4362
|
-
const words = text.split(/\s+/).length;
|
|
4363
|
-
if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
|
|
4364
|
-
if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
|
|
4365
|
-
return false;
|
|
4366
|
-
}
|
|
4367
4381
|
function scoreDuplicateContent(html) {
|
|
4368
4382
|
return scoreDuplicateContentDetailed(html).score;
|
|
4369
4383
|
}
|
|
4370
4384
|
function scoreDuplicateContentDetailed(html) {
|
|
4371
|
-
const sections = extractSectionsWithParagraphs(html);
|
|
4385
|
+
const sections = extractDuplicateContentSections(html);
|
|
4372
4386
|
if (sections.length < 2) return { score: 10, duplicates: [] };
|
|
4373
4387
|
const totalParagraphs = sections.reduce((sum, s) => sum + s.paragraphs.length, 0);
|
|
4374
4388
|
const duplicates = [];
|
|
@@ -4377,7 +4391,7 @@ function scoreDuplicateContentDetailed(html) {
|
|
|
4377
4391
|
for (let j = i + 1; j < sections.length; j++) {
|
|
4378
4392
|
for (const pA of sections[i].paragraphs) {
|
|
4379
4393
|
for (const pB of sections[j].paragraphs) {
|
|
4380
|
-
const sim = shingleJaccard(pA.shingles, pB.shingles);
|
|
4394
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
4381
4395
|
if (sim > 0.4) {
|
|
4382
4396
|
dupParagraphCount++;
|
|
4383
4397
|
duplicates.push({
|
|
@@ -4406,41 +4420,6 @@ function scoreDuplicateContentDetailed(html) {
|
|
|
4406
4420
|
}
|
|
4407
4421
|
return { score, duplicates };
|
|
4408
4422
|
}
|
|
4409
|
-
function extractSectionsWithParagraphs(html) {
|
|
4410
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
4411
|
-
const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
|
|
4412
|
-
const sections = [];
|
|
4413
|
-
for (const part of parts) {
|
|
4414
|
-
const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
|
|
4415
|
-
const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
|
|
4416
|
-
const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
4417
|
-
const paragraphs = pMatches.map((p) => {
|
|
4418
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
4419
|
-
return { text, shingles: buildShingles(text, 4) };
|
|
4420
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplate(p.text));
|
|
4421
|
-
if (paragraphs.length > 0) {
|
|
4422
|
-
sections.push({ heading, paragraphs });
|
|
4423
|
-
}
|
|
4424
|
-
}
|
|
4425
|
-
return sections;
|
|
4426
|
-
}
|
|
4427
|
-
function buildShingles(text, n) {
|
|
4428
|
-
const words = text.split(/\s+/).filter((w) => w.length > 1);
|
|
4429
|
-
const shingles = /* @__PURE__ */ new Set();
|
|
4430
|
-
for (let i = 0; i <= words.length - n; i++) {
|
|
4431
|
-
shingles.add(words.slice(i, i + n).join(" "));
|
|
4432
|
-
}
|
|
4433
|
-
return shingles;
|
|
4434
|
-
}
|
|
4435
|
-
function shingleJaccard(a, b) {
|
|
4436
|
-
if (a.size === 0 && b.size === 0) return 0;
|
|
4437
|
-
let intersection = 0;
|
|
4438
|
-
for (const s of a) {
|
|
4439
|
-
if (b.has(s)) intersection++;
|
|
4440
|
-
}
|
|
4441
|
-
const union = a.size + b.size - intersection;
|
|
4442
|
-
return union === 0 ? 0 : intersection / union;
|
|
4443
|
-
}
|
|
4444
4423
|
var SCORING_FUNCTIONS = {
|
|
4445
4424
|
schema_markup: scoreSchemaMarkup,
|
|
4446
4425
|
qa_content_format: scoreQAFormat,
|