aeorank 3.1.0 → 3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/browser.js +76 -97
- package/dist/browser.js.map +1 -1
- package/dist/cli.js +76 -97
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +76 -97
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +76 -97
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -78,6 +78,75 @@ function detectParkedDomain(bodySnippet) {
|
|
|
78
78
|
return { isParked: false };
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
+
// src/duplicate-content.ts
|
|
82
|
+
// Call-to-action and legal phrases. isBoilerplateParagraph tests paragraphs of
// fewer than 20 words against this pattern (case-insensitive, whole-phrase match).
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
// createParagraph discards any paragraph with fewer tokens than this.
var MIN_SUBSTANTIVE_WORDS = 15;
// isMetadataParagraph only treats a "label: value" paragraph as metadata when
// its total token count is at most this.
var MAX_METADATA_WORDS = 24;
// isMetadataParagraph requires the label before the first colon to have at
// most this many tokens.
var MAX_METADATA_LABEL_WORDS = 4;
|
|
86
|
+
/**
 * Reduce an HTML fragment to lower-cased plain text suitable for comparison.
 *
 * Every tag and every character reference is replaced by a single space,
 * runs of whitespace are collapsed to one space, and the result is trimmed
 * and lower-cased.
 *
 * @param {string} htmlFragment - Raw HTML of one paragraph.
 * @returns {string} The normalized text.
 */
function normalizeParagraphText(htmlFragment) {
  return htmlFragment
    .replace(/<[^>]*>/g, " ")
    // Cover named (&amp;), decimal (&#8217;) and hex (&#x2019;) references.
    // `\w+` alone never matches the numeric forms because `#` is not a word
    // character, which left them in the text as literal tokens.
    .replace(/&(?:#\d+|#x[0-9a-f]+|\w+);/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
}
|
|
89
|
+
/**
 * Split text into word tokens.
 *
 * The text is split on whitespace; each piece has leading and trailing
 * characters that are neither letters nor digits removed, and pieces that
 * end up empty are dropped. Characters inside a token (e.g. the apostrophe
 * in "don't") are kept.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} The non-empty tokens in their original order.
 */
function tokenize(text) {
  // Unicode letter/number classes instead of [a-z0-9]: the ASCII-only class
  // trimmed accented letters off word edges ("café" -> "caf") and discarded
  // words written entirely in non-Latin scripts.
  const edgeNoise = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu;
  return text
    .split(/\s+/)
    .map((word) => word.replace(edgeNoise, ""))
    .filter((word) => word.length > 0);
}
|
|
92
|
+
/**
 * Decide whether a paragraph is short call-to-action or consent boilerplate.
 *
 * @param {string} text - Paragraph text.
 * @param {number} words - Number of words in the paragraph.
 * @returns {boolean} True when the paragraph has fewer than 20 words and
 *   matches BOILERPLATE_PATTERNS, or has fewer than 30 words and mentions
 *   cookie/GDPR/consent/opt-out wording.
 */
function isBoilerplateParagraph(text, words) {
  const isShortCallToAction = words < 20 && BOILERPLATE_PATTERNS.test(text);
  if (isShortCallToAction) {
    return true;
  }
  const mentionsConsent = /\b(cookie|gdpr|consent|opt.out)\b/i.test(text);
  return mentionsConsent && words < 30;
}
|
|
97
|
+
/**
 * Decide whether a paragraph looks like a short "label: value" metadata line.
 *
 * @param {string} text - Paragraph text.
 * @param {number} words - Number of words in the paragraph.
 * @returns {boolean} True when the text starts with a label of 1-60 non-colon
 *   characters followed by a colon and whitespace, the label has between one
 *   and MAX_METADATA_LABEL_WORDS tokens, and the paragraph has at most
 *   MAX_METADATA_WORDS words.
 */
function isMetadataParagraph(text, words) {
  const labelHit = /^([^:]{1,60}):\s+/.exec(text);
  if (labelHit === null) {
    return false;
  }
  const labelTokenCount = tokenize(labelHit[1]).length;
  if (labelTokenCount <= 0) {
    return false;
  }
  if (labelTokenCount > MAX_METADATA_LABEL_WORDS) {
    return false;
  }
  return words <= MAX_METADATA_WORDS;
}
|
|
103
|
+
/**
 * Build the set of word n-grams ("shingles") for a token list.
 *
 * @param {string[]} words - Tokens in reading order.
 * @param {number} [n=4] - Number of consecutive tokens per shingle.
 * @returns {Set<string>} Each run of `n` consecutive tokens joined by single
 *   spaces; empty when there are fewer than `n` tokens.
 */
function buildShinglesFromTokens(words, n = 4) {
  const windowCount = words.length - n + 1;
  const shingles = /* @__PURE__ */ new Set();
  let start = 0;
  while (start < windowCount) {
    const window = words.slice(start, start + n);
    shingles.add(window.join(" "));
    start += 1;
  }
  return shingles;
}
|
|
110
|
+
/**
 * Turn one paragraph's HTML into a comparable record, or reject it.
 *
 * The fragment is normalized and tokenized; it is rejected when it has fewer
 * than MIN_SUBSTANTIVE_WORDS tokens, is classified as boilerplate or as
 * metadata, or yields fewer than three distinct shingles.
 *
 * @param {string} htmlFragment - Raw HTML of one paragraph.
 * @returns {{text: string, shingles: Set<string>} | null} The normalized text
 *   with its shingle set, or null when the paragraph is rejected.
 */
function createParagraph(htmlFragment) {
  const text = normalizeParagraphText(htmlFragment);
  const tokens = tokenize(text);
  const tokenCount = tokens.length;
  const isRejected =
    tokenCount < MIN_SUBSTANTIVE_WORDS ||
    isBoilerplateParagraph(text, tokenCount) ||
    isMetadataParagraph(text, tokenCount);
  if (isRejected) {
    return null;
  }
  const shingles = buildShinglesFromTokens(tokens);
  return shingles.size >= 3 ? { text, shingles } : null;
}
|
|
120
|
+
/**
 * Remove page-chrome elements and their contents from an HTML string.
 *
 * First pass drops script, style, nav, header, footer and noscript elements
 * (opening tag through the nearest matching closing tag); a second pass drops
 * aside elements from what remains.
 *
 * @param {string} html - HTML source.
 * @returns {string} The HTML with those elements removed.
 */
function stripNonContentHtml(html) {
  const chromeElements = /<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi;
  const asideElements = /<aside\b[^>]*>[\s\S]*?<\/aside>/gi;
  const withoutChrome = html.replace(chromeElements, "");
  return withoutChrome.replace(asideElements, "");
}
|
|
123
|
+
/**
 * Collect the comparable paragraphs of a page.
 *
 * Non-content elements are stripped, every `<p>` element is passed to
 * createParagraph, and rejected (null) results are left out.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} Accepted paragraphs
 *   in document order.
 */
function extractDuplicateContentParagraphs(html) {
  const paragraphHtml = stripNonContentHtml(html).match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
  const accepted = [];
  for (const fragment of paragraphHtml) {
    const paragraph = createParagraph(fragment);
    if (paragraph !== null) {
      accepted.push(paragraph);
    }
  }
  return accepted;
}
|
|
128
|
+
/**
 * Split a page into heading-delimited sections of comparable paragraphs.
 *
 * Non-content elements are stripped and the remainder is cut immediately
 * before every `<h2>`/`<h3>` opening tag. Each piece is labelled with the
 * tag-stripped, trimmed text of its first h2/h3, or "(intro)" when it has
 * none. Pieces with no accepted paragraph are omitted.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in document order.
 */
function extractDuplicateContentSections(html) {
  const headingPattern = /<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i;
  const sections = [];
  const chunks = stripNonContentHtml(html).split(/(?=<h[23]\b[^>]*>)/i);
  chunks.forEach((chunk) => {
    const headingHit = headingPattern.exec(chunk);
    const heading = headingHit === null ? "(intro)" : headingHit[1].replace(/<[^>]*>/g, "").trim();
    const paragraphs = [];
    for (const fragment of chunk.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []) {
      const paragraph = createParagraph(fragment);
      if (paragraph !== null) {
        paragraphs.push(paragraph);
      }
    }
    if (paragraphs.length > 0) {
      sections.push({ heading, paragraphs });
    }
  });
  return sections;
}
|
|
140
|
+
/**
 * Jaccard similarity of two shingle sets.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} |a ∩ b| / |a ∪ b|, or 0 when both sets are empty.
 */
function shingleJaccardSimilarity(a, b) {
  if (a.size === 0 && b.size === 0) {
    return 0;
  }
  let shared = 0;
  a.forEach((shingle) => {
    if (b.has(shingle)) {
      shared += 1;
    }
  });
  // Inclusion-exclusion: |a ∪ b| = |a| + |b| - |a ∩ b|.
  const combined = a.size + b.size - shared;
  if (combined === 0) {
    return 0;
  }
  return shared / combined;
}
|
|
149
|
+
|
|
81
150
|
// src/site-crawler.ts
|
|
82
151
|
async function fetchText(url) {
|
|
83
152
|
try {
|
|
@@ -2558,56 +2627,8 @@ function checkImageContextAI(data) {
|
|
|
2558
2627
|
}
|
|
2559
2628
|
return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
|
|
2560
2629
|
}
|
|
2561
|
-
var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
2562
|
-
function isBoilerplateParagraph(text) {
|
|
2563
|
-
const words = text.split(/\s+/).length;
|
|
2564
|
-
if (words < 20 && BOILERPLATE_RE.test(text)) return true;
|
|
2565
|
-
if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
|
|
2566
|
-
return false;
|
|
2567
|
-
}
|
|
2568
|
-
function toShingles(text, n = 4) {
|
|
2569
|
-
const words = text.split(/\s+/).filter((w) => w.length > 1);
|
|
2570
|
-
const shingles = /* @__PURE__ */ new Set();
|
|
2571
|
-
for (let i = 0; i <= words.length - n; i++) {
|
|
2572
|
-
shingles.add(words.slice(i, i + n).join(" "));
|
|
2573
|
-
}
|
|
2574
|
-
return shingles;
|
|
2575
|
-
}
|
|
2576
|
-
function shingleSimilarity(a, b) {
|
|
2577
|
-
if (a.size === 0 && b.size === 0) return 0;
|
|
2578
|
-
let intersection = 0;
|
|
2579
|
-
for (const s of a) {
|
|
2580
|
-
if (b.has(s)) intersection++;
|
|
2581
|
-
}
|
|
2582
|
-
const union = a.size + b.size - intersection;
|
|
2583
|
-
return union === 0 ? 0 : intersection / union;
|
|
2584
|
-
}
|
|
2585
|
-
function extractPageParagraphs(html) {
|
|
2586
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
2587
|
-
const pMatches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
2588
|
-
return pMatches.map((p) => {
|
|
2589
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
2590
|
-
return { text, shingles: toShingles(text) };
|
|
2591
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
|
|
2592
|
-
}
|
|
2593
|
-
function splitIntoSectionsWithParagraphs(html) {
|
|
2594
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
2595
|
-
const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
|
|
2596
|
-
const sections = [];
|
|
2597
|
-
for (const part of parts) {
|
|
2598
|
-
const hMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
|
|
2599
|
-
const heading = hMatch ? hMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
|
|
2600
|
-
const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
2601
|
-
const paragraphs = pMatches.map((p) => {
|
|
2602
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
2603
|
-
return { text, shingles: toShingles(text) };
|
|
2604
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
|
|
2605
|
-
if (paragraphs.length > 0) sections.push({ heading, paragraphs });
|
|
2606
|
-
}
|
|
2607
|
-
return sections;
|
|
2608
|
-
}
|
|
2609
2630
|
function findIntraPageDuplicates(html) {
|
|
2610
|
-
const sections =
|
|
2631
|
+
const sections = extractDuplicateContentSections(html);
|
|
2611
2632
|
if (sections.length < 2) return [];
|
|
2612
2633
|
const pairs = [];
|
|
2613
2634
|
for (let i = 0; i < sections.length; i++) {
|
|
@@ -2616,7 +2637,7 @@ function findIntraPageDuplicates(html) {
|
|
|
2616
2637
|
for (const pA of sections[i].paragraphs) {
|
|
2617
2638
|
if (found) break;
|
|
2618
2639
|
for (const pB of sections[j].paragraphs) {
|
|
2619
|
-
const sim =
|
|
2640
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
2620
2641
|
if (sim > 0.4) {
|
|
2621
2642
|
pairs.push({
|
|
2622
2643
|
headingA: sections[i].heading,
|
|
@@ -2696,11 +2717,11 @@ function checkCrossPageDuplication(data) {
|
|
|
2696
2717
|
const findings = [];
|
|
2697
2718
|
const pages = [];
|
|
2698
2719
|
if (data.homepage) {
|
|
2699
|
-
pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs:
|
|
2720
|
+
pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
|
|
2700
2721
|
}
|
|
2701
2722
|
if (data.blogSample) {
|
|
2702
2723
|
for (const page of data.blogSample) {
|
|
2703
|
-
pages.push({ url: page.finalUrl || "", paragraphs:
|
|
2724
|
+
pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
|
|
2704
2725
|
}
|
|
2705
2726
|
}
|
|
2706
2727
|
if (pages.length <= 1) {
|
|
@@ -2732,7 +2753,7 @@ function checkCrossPageDuplication(data) {
|
|
|
2732
2753
|
const fpA = [...pA.shingles].slice(0, 5).join("|");
|
|
2733
2754
|
if (siteBoilerprints.has(fpA)) continue;
|
|
2734
2755
|
for (const pB of pages[j].paragraphs) {
|
|
2735
|
-
const sim =
|
|
2756
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
2736
2757
|
if (sim > 0.4) {
|
|
2737
2758
|
dupCount++;
|
|
2738
2759
|
if (!sample) sample = pA.text.slice(0, 80);
|
|
@@ -4458,18 +4479,11 @@ function scoreImageContextAI(html) {
|
|
|
4458
4479
|
if (contextualImages.length > 0) score += 3;
|
|
4459
4480
|
return cap(score, 10);
|
|
4460
4481
|
}
|
|
4461
|
-
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
4462
|
-
function isBoilerplate(text) {
|
|
4463
|
-
const words = text.split(/\s+/).length;
|
|
4464
|
-
if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
|
|
4465
|
-
if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
|
|
4466
|
-
return false;
|
|
4467
|
-
}
|
|
4468
4482
|
/**
 * Score-only wrapper around scoreDuplicateContentDetailed.
 *
 * @param {string} html - Page HTML.
 * @returns {number} The `score` field of the detailed result.
 */
function scoreDuplicateContent(html) {
  const { score } = scoreDuplicateContentDetailed(html);
  return score;
}
|
|
4471
4485
|
function scoreDuplicateContentDetailed(html) {
|
|
4472
|
-
const sections =
|
|
4486
|
+
const sections = extractDuplicateContentSections(html);
|
|
4473
4487
|
if (sections.length < 2) return { score: 10, duplicates: [] };
|
|
4474
4488
|
const totalParagraphs = sections.reduce((sum, s) => sum + s.paragraphs.length, 0);
|
|
4475
4489
|
const duplicates = [];
|
|
@@ -4478,7 +4492,7 @@ function scoreDuplicateContentDetailed(html) {
|
|
|
4478
4492
|
for (let j = i + 1; j < sections.length; j++) {
|
|
4479
4493
|
for (const pA of sections[i].paragraphs) {
|
|
4480
4494
|
for (const pB of sections[j].paragraphs) {
|
|
4481
|
-
const sim =
|
|
4495
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
4482
4496
|
if (sim > 0.4) {
|
|
4483
4497
|
dupParagraphCount++;
|
|
4484
4498
|
duplicates.push({
|
|
@@ -4507,41 +4521,6 @@ function scoreDuplicateContentDetailed(html) {
|
|
|
4507
4521
|
}
|
|
4508
4522
|
return { score, duplicates };
|
|
4509
4523
|
}
|
|
4510
|
-
function extractSectionsWithParagraphs(html) {
|
|
4511
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
4512
|
-
const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
|
|
4513
|
-
const sections = [];
|
|
4514
|
-
for (const part of parts) {
|
|
4515
|
-
const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
|
|
4516
|
-
const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
|
|
4517
|
-
const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
4518
|
-
const paragraphs = pMatches.map((p) => {
|
|
4519
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
4520
|
-
return { text, shingles: buildShingles(text, 4) };
|
|
4521
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplate(p.text));
|
|
4522
|
-
if (paragraphs.length > 0) {
|
|
4523
|
-
sections.push({ heading, paragraphs });
|
|
4524
|
-
}
|
|
4525
|
-
}
|
|
4526
|
-
return sections;
|
|
4527
|
-
}
|
|
4528
|
-
function buildShingles(text, n) {
|
|
4529
|
-
const words = text.split(/\s+/).filter((w) => w.length > 1);
|
|
4530
|
-
const shingles = /* @__PURE__ */ new Set();
|
|
4531
|
-
for (let i = 0; i <= words.length - n; i++) {
|
|
4532
|
-
shingles.add(words.slice(i, i + n).join(" "));
|
|
4533
|
-
}
|
|
4534
|
-
return shingles;
|
|
4535
|
-
}
|
|
4536
|
-
function shingleJaccard(a, b) {
|
|
4537
|
-
if (a.size === 0 && b.size === 0) return 0;
|
|
4538
|
-
let intersection = 0;
|
|
4539
|
-
for (const s of a) {
|
|
4540
|
-
if (b.has(s)) intersection++;
|
|
4541
|
-
}
|
|
4542
|
-
const union = a.size + b.size - intersection;
|
|
4543
|
-
return union === 0 ? 0 : intersection / union;
|
|
4544
|
-
}
|
|
4545
4524
|
var SCORING_FUNCTIONS = {
|
|
4546
4525
|
schema_markup: scoreSchemaMarkup,
|
|
4547
4526
|
qa_content_format: scoreQAFormat,
|