aeorank 3.1.0 → 3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/browser.js +76 -97
- package/dist/browser.js.map +1 -1
- package/dist/cli.js +76 -97
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +76 -97
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +76 -97
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -76,6 +76,75 @@ function detectParkedDomain(bodySnippet) {
|
|
|
76
76
|
return { isParked: false };
|
|
77
77
|
}
|
|
78
78
|
|
|
79
|
+
// src/duplicate-content.ts
|
|
80
|
+
// Call-to-action and legal phrases that mark a short paragraph as boilerplate.
// The phrases are joined into one case-insensitive, word-bounded alternation.
// `var` is retained to match the other module-level declarations in this bundle.
var BOILERPLATE_PATTERNS = new RegExp(
  `\\b(${[
    "sign up",
    "subscribe",
    "get started",
    "contact us",
    "request a demo",
    "free trial",
    "book a call",
    "schedule a",
    "learn more",
    "click here",
    "follow us",
    "share this",
    "copyright",
    "all rights reserved",
    "privacy policy",
    "terms of service",
  ].join("|")})\\b`,
  "i"
);

// Paragraphs with fewer tokens than this are rejected by createParagraph.
var MIN_SUBSTANTIVE_WORDS = 15;

// Largest total token count at which a "label: value" paragraph still counts as metadata.
var MAX_METADATA_WORDS = 24;

// Largest token count allowed in the label (the text before the colon) of a metadata paragraph.
var MAX_METADATA_LABEL_WORDS = 4;
|
|
84
|
+
/**
 * Reduce an HTML fragment to lowercase plain text suitable for comparison.
 *
 * Tags and character references are each replaced by a space, runs of
 * whitespace collapse to a single space, and the result is trimmed and
 * lowercased.
 *
 * @param {string} htmlFragment - Raw HTML, e.g. a complete `<p>…</p>` match.
 * @returns {string} The normalized text.
 */
function normalizeParagraphText(htmlFragment) {
  return htmlFragment
    .replace(/<[^>]*>/g, " ")
    // Covers named (&amp;), decimal (&#39;) and hexadecimal (&#x27;) references.
    // A bare \w+ misses the numeric forms because "#" is not a word character,
    // which would leave "&#39;" embedded in the text and in every shingle.
    .replace(/&(?:#\d+|#x[0-9a-f]+|\w+);/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
}
|
|
87
|
+
/**
 * Split text into word tokens, trimming punctuation from the edges of each word.
 *
 * The text is split on whitespace; leading and trailing characters that are
 * not letters, combining marks, or digits are removed from every piece, and
 * pieces that end up empty are dropped. Punctuation inside a word is kept.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} The non-empty tokens, in order of appearance.
 */
function tokenize(text) {
  // Unicode-aware edge trimming: an ASCII-only class ([^a-z0-9]) would eat
  // accented letters at word edges ("café" -> "caf") and erase words written
  // entirely in non-Latin scripts, so such pages would yield no tokens at all.
  const edgeNoise = /^[^\p{L}\p{M}\p{N}]+|[^\p{L}\p{M}\p{N}]+$/gu;
  return text
    .split(/\s+/)
    .map((word) => word.replace(edgeNoise, ""))
    .filter((word) => word.length > 0);
}
|
|
90
|
+
/**
 * Decide whether a paragraph is site boilerplate rather than real content.
 *
 * A paragraph is boilerplate when it has fewer than 20 words and matches
 * BOILERPLATE_PATTERNS, or when it has fewer than 30 words and mentions
 * cookie / gdpr / consent / opt-out wording.
 *
 * @param {string} text - Paragraph text to inspect.
 * @param {number} words - Word count of the paragraph.
 * @returns {boolean} True when the paragraph should be ignored.
 */
function isBoilerplateParagraph(text, words) {
  const isShortCallToAction = words < 20 && BOILERPLATE_PATTERNS.test(text);
  if (isShortCallToAction) {
    return true;
  }
  const mentionsConsent = /\b(cookie|gdpr|consent|opt.out)\b/i.test(text);
  return mentionsConsent && words < 30;
}
|
|
95
|
+
/**
 * Decide whether a paragraph looks like a short "label: value" metadata line.
 *
 * The text must start with 1-60 non-colon characters followed by a colon and
 * whitespace. The label must tokenize to between 1 and
 * MAX_METADATA_LABEL_WORDS words, and the whole paragraph must have at most
 * MAX_METADATA_WORDS words.
 *
 * @param {string} text - Paragraph text to inspect.
 * @param {number} words - Word count of the whole paragraph.
 * @returns {boolean} True when the paragraph is treated as metadata.
 */
function isMetadataParagraph(text, words) {
  const labelMatch = /^([^:]{1,60}):\s+/.exec(text);
  if (labelMatch === null) {
    return false;
  }
  const [, label] = labelMatch;
  const labelWordCount = tokenize(label).length;
  if (labelWordCount === 0 || labelWordCount > MAX_METADATA_LABEL_WORDS) {
    return false;
  }
  return words <= MAX_METADATA_WORDS;
}
|
|
101
|
+
/**
 * Build the set of word n-grams ("shingles") from a token list.
 *
 * Every run of `n` consecutive tokens is joined with single spaces and added
 * to the set; fewer than `n` tokens yields an empty set.
 *
 * @param {string[]} words - Tokens in reading order.
 * @param {number} [n=4] - Number of tokens per shingle.
 * @returns {Set<string>} The distinct shingles, in first-seen order.
 */
function buildShinglesFromTokens(words, n = 4) {
  const shingles = /* @__PURE__ */ new Set();
  const lastStart = words.length - n;
  let start = 0;
  while (start <= lastStart) {
    const gram = words.slice(start, start + n);
    shingles.add(gram.join(" "));
    start += 1;
  }
  return shingles;
}
|
|
108
|
+
/**
 * Turn one HTML paragraph fragment into a comparable paragraph record.
 *
 * The fragment is normalized and tokenized. It is rejected (null) when it has
 * fewer than MIN_SUBSTANTIVE_WORDS tokens, is classified as boilerplate or
 * metadata, or produces fewer than 3 distinct shingles.
 *
 * @param {string} htmlFragment - Raw HTML of a single paragraph.
 * @returns {{text: string, shingles: Set<string>} | null} The paragraph record, or null when rejected.
 */
function createParagraph(htmlFragment) {
  const text = normalizeParagraphText(htmlFragment);
  const tokens = tokenize(text);
  const wordCount = tokens.length;

  const isRejected =
    wordCount < MIN_SUBSTANTIVE_WORDS ||
    isBoilerplateParagraph(text, wordCount) ||
    isMetadataParagraph(text, wordCount);
  if (isRejected) {
    return null;
  }

  const shingles = buildShinglesFromTokens(tokens);
  return shingles.size < 3 ? null : { text, shingles };
}
|
|
118
|
+
/**
 * Remove page chrome from an HTML string before content is extracted.
 *
 * First drops script, style, nav, header, footer and noscript elements
 * (opening tag through the matching closing tag of the same name), then
 * drops aside elements from what remains. Matching is case-insensitive.
 *
 * @param {string} html - Full page HTML.
 * @returns {string} The HTML with those elements removed.
 */
function stripNonContentHtml(html) {
  const chromeElements = /<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi;
  const asideElements = /<aside\b[^>]*>[\s\S]*?<\/aside>/gi;
  // Two sequential passes on purpose: asides are matched against the output of the first pass.
  const withoutChrome = html.replace(chromeElements, "");
  return withoutChrome.replace(asideElements, "");
}
|
|
121
|
+
/**
 * Collect every substantive paragraph on a page for duplicate detection.
 *
 * Non-content elements are stripped, each `<p>…</p>` match is passed to
 * createParagraph, and rejected (null) results are left out.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} Accepted paragraphs in document order.
 */
function extractDuplicateContentParagraphs(html) {
  const fragments = stripNonContentHtml(html).match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
  const paragraphs = [];
  for (const fragment of fragments) {
    const paragraph = createParagraph(fragment);
    if (paragraph !== null) {
      paragraphs.push(paragraph);
    }
  }
  return paragraphs;
}
|
|
126
|
+
/**
 * Split a page into h2/h3-delimited sections, each with its substantive paragraphs.
 *
 * Non-content elements are stripped, the HTML is split in front of every
 * `<h2>`/`<h3>` opening tag, and each part's paragraphs are passed to
 * createParagraph. Parts with no accepted paragraph are omitted. A part
 * without a heading is labelled "(intro)".
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>} Sections in document order.
 */
function extractDuplicateContentSections(html) {
  const sections = [];
  const parts = stripNonContentHtml(html).split(/(?=<h[23]\b[^>]*>)/i);

  for (const part of parts) {
    const paragraphs = [];
    for (const fragment of part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []) {
      const paragraph = createParagraph(fragment);
      if (paragraph !== null) {
        paragraphs.push(paragraph);
      }
    }
    if (paragraphs.length === 0) {
      continue;
    }

    // Heading text has any nested tags removed before trimming.
    const headingMatch = /<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i.exec(part);
    let heading = "(intro)";
    if (headingMatch) {
      heading = headingMatch[1].replace(/<[^>]*>/g, "").trim();
    }
    sections.push({ heading, paragraphs });
  }

  return sections;
}
|
|
138
|
+
/**
 * Jaccard similarity of two shingle sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} A value from 0 to 1; 0 when both sets are empty.
 */
function shingleJaccardSimilarity(a, b) {
  if (a.size === 0 && b.size === 0) {
    return 0;
  }
  const shared = [...a].filter((shingle) => b.has(shingle)).length;
  const unionSize = a.size + b.size - shared;
  if (unionSize === 0) {
    return 0;
  }
  return shared / unionSize;
}
|
|
147
|
+
|
|
79
148
|
// src/site-crawler.ts
|
|
80
149
|
async function fetchText(url) {
|
|
81
150
|
try {
|
|
@@ -2556,56 +2625,8 @@ function checkImageContextAI(data) {
|
|
|
2556
2625
|
}
|
|
2557
2626
|
return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
|
|
2558
2627
|
}
|
|
2559
|
-
var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
2560
|
-
function isBoilerplateParagraph(text) {
|
|
2561
|
-
const words = text.split(/\s+/).length;
|
|
2562
|
-
if (words < 20 && BOILERPLATE_RE.test(text)) return true;
|
|
2563
|
-
if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
|
|
2564
|
-
return false;
|
|
2565
|
-
}
|
|
2566
|
-
function toShingles(text, n = 4) {
|
|
2567
|
-
const words = text.split(/\s+/).filter((w) => w.length > 1);
|
|
2568
|
-
const shingles = /* @__PURE__ */ new Set();
|
|
2569
|
-
for (let i = 0; i <= words.length - n; i++) {
|
|
2570
|
-
shingles.add(words.slice(i, i + n).join(" "));
|
|
2571
|
-
}
|
|
2572
|
-
return shingles;
|
|
2573
|
-
}
|
|
2574
|
-
function shingleSimilarity(a, b) {
|
|
2575
|
-
if (a.size === 0 && b.size === 0) return 0;
|
|
2576
|
-
let intersection = 0;
|
|
2577
|
-
for (const s of a) {
|
|
2578
|
-
if (b.has(s)) intersection++;
|
|
2579
|
-
}
|
|
2580
|
-
const union = a.size + b.size - intersection;
|
|
2581
|
-
return union === 0 ? 0 : intersection / union;
|
|
2582
|
-
}
|
|
2583
|
-
function extractPageParagraphs(html) {
|
|
2584
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
2585
|
-
const pMatches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
2586
|
-
return pMatches.map((p) => {
|
|
2587
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
2588
|
-
return { text, shingles: toShingles(text) };
|
|
2589
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
|
|
2590
|
-
}
|
|
2591
|
-
function splitIntoSectionsWithParagraphs(html) {
|
|
2592
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
2593
|
-
const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
|
|
2594
|
-
const sections = [];
|
|
2595
|
-
for (const part of parts) {
|
|
2596
|
-
const hMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
|
|
2597
|
-
const heading = hMatch ? hMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
|
|
2598
|
-
const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
2599
|
-
const paragraphs = pMatches.map((p) => {
|
|
2600
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
2601
|
-
return { text, shingles: toShingles(text) };
|
|
2602
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
|
|
2603
|
-
if (paragraphs.length > 0) sections.push({ heading, paragraphs });
|
|
2604
|
-
}
|
|
2605
|
-
return sections;
|
|
2606
|
-
}
|
|
2607
2628
|
function findIntraPageDuplicates(html) {
|
|
2608
|
-
const sections =
|
|
2629
|
+
const sections = extractDuplicateContentSections(html);
|
|
2609
2630
|
if (sections.length < 2) return [];
|
|
2610
2631
|
const pairs = [];
|
|
2611
2632
|
for (let i = 0; i < sections.length; i++) {
|
|
@@ -2614,7 +2635,7 @@ function findIntraPageDuplicates(html) {
|
|
|
2614
2635
|
for (const pA of sections[i].paragraphs) {
|
|
2615
2636
|
if (found) break;
|
|
2616
2637
|
for (const pB of sections[j].paragraphs) {
|
|
2617
|
-
const sim =
|
|
2638
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
2618
2639
|
if (sim > 0.4) {
|
|
2619
2640
|
pairs.push({
|
|
2620
2641
|
headingA: sections[i].heading,
|
|
@@ -2694,11 +2715,11 @@ function checkCrossPageDuplication(data) {
|
|
|
2694
2715
|
const findings = [];
|
|
2695
2716
|
const pages = [];
|
|
2696
2717
|
if (data.homepage) {
|
|
2697
|
-
pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs:
|
|
2718
|
+
pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
|
|
2698
2719
|
}
|
|
2699
2720
|
if (data.blogSample) {
|
|
2700
2721
|
for (const page of data.blogSample) {
|
|
2701
|
-
pages.push({ url: page.finalUrl || "", paragraphs:
|
|
2722
|
+
pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
|
|
2702
2723
|
}
|
|
2703
2724
|
}
|
|
2704
2725
|
if (pages.length <= 1) {
|
|
@@ -2730,7 +2751,7 @@ function checkCrossPageDuplication(data) {
|
|
|
2730
2751
|
const fpA = [...pA.shingles].slice(0, 5).join("|");
|
|
2731
2752
|
if (siteBoilerprints.has(fpA)) continue;
|
|
2732
2753
|
for (const pB of pages[j].paragraphs) {
|
|
2733
|
-
const sim =
|
|
2754
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
2734
2755
|
if (sim > 0.4) {
|
|
2735
2756
|
dupCount++;
|
|
2736
2757
|
if (!sample) sample = pA.text.slice(0, 80);
|
|
@@ -4433,18 +4454,11 @@ function scoreImageContextAI(html) {
|
|
|
4433
4454
|
if (contextualImages.length > 0) score += 3;
|
|
4434
4455
|
return cap(score, 10);
|
|
4435
4456
|
}
|
|
4436
|
-
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
4437
|
-
function isBoilerplate(text) {
|
|
4438
|
-
const words = text.split(/\s+/).length;
|
|
4439
|
-
if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
|
|
4440
|
-
if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
|
|
4441
|
-
return false;
|
|
4442
|
-
}
|
|
4443
4457
|
/**
 * Return only the numeric score from scoreDuplicateContentDetailed.
 *
 * @param {string} html - Page HTML to score.
 * @returns {number} The `score` field of the detailed result.
 */
function scoreDuplicateContent(html) {
  const { score } = scoreDuplicateContentDetailed(html);
  return score;
}
|
|
4446
4460
|
function scoreDuplicateContentDetailed(html) {
|
|
4447
|
-
const sections =
|
|
4461
|
+
const sections = extractDuplicateContentSections(html);
|
|
4448
4462
|
if (sections.length < 2) return { score: 10, duplicates: [] };
|
|
4449
4463
|
const totalParagraphs = sections.reduce((sum, s) => sum + s.paragraphs.length, 0);
|
|
4450
4464
|
const duplicates = [];
|
|
@@ -4453,7 +4467,7 @@ function scoreDuplicateContentDetailed(html) {
|
|
|
4453
4467
|
for (let j = i + 1; j < sections.length; j++) {
|
|
4454
4468
|
for (const pA of sections[i].paragraphs) {
|
|
4455
4469
|
for (const pB of sections[j].paragraphs) {
|
|
4456
|
-
const sim =
|
|
4470
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
4457
4471
|
if (sim > 0.4) {
|
|
4458
4472
|
dupParagraphCount++;
|
|
4459
4473
|
duplicates.push({
|
|
@@ -4482,41 +4496,6 @@ function scoreDuplicateContentDetailed(html) {
|
|
|
4482
4496
|
}
|
|
4483
4497
|
return { score, duplicates };
|
|
4484
4498
|
}
|
|
4485
|
-
function extractSectionsWithParagraphs(html) {
|
|
4486
|
-
const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
|
|
4487
|
-
const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
|
|
4488
|
-
const sections = [];
|
|
4489
|
-
for (const part of parts) {
|
|
4490
|
-
const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
|
|
4491
|
-
const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
|
|
4492
|
-
const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
|
|
4493
|
-
const paragraphs = pMatches.map((p) => {
|
|
4494
|
-
const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
4495
|
-
return { text, shingles: buildShingles(text, 4) };
|
|
4496
|
-
}).filter((p) => p.shingles.size >= 3 && !isBoilerplate(p.text));
|
|
4497
|
-
if (paragraphs.length > 0) {
|
|
4498
|
-
sections.push({ heading, paragraphs });
|
|
4499
|
-
}
|
|
4500
|
-
}
|
|
4501
|
-
return sections;
|
|
4502
|
-
}
|
|
4503
|
-
function buildShingles(text, n) {
|
|
4504
|
-
const words = text.split(/\s+/).filter((w) => w.length > 1);
|
|
4505
|
-
const shingles = /* @__PURE__ */ new Set();
|
|
4506
|
-
for (let i = 0; i <= words.length - n; i++) {
|
|
4507
|
-
shingles.add(words.slice(i, i + n).join(" "));
|
|
4508
|
-
}
|
|
4509
|
-
return shingles;
|
|
4510
|
-
}
|
|
4511
|
-
function shingleJaccard(a, b) {
|
|
4512
|
-
if (a.size === 0 && b.size === 0) return 0;
|
|
4513
|
-
let intersection = 0;
|
|
4514
|
-
for (const s of a) {
|
|
4515
|
-
if (b.has(s)) intersection++;
|
|
4516
|
-
}
|
|
4517
|
-
const union = a.size + b.size - intersection;
|
|
4518
|
-
return union === 0 ? 0 : intersection / union;
|
|
4519
|
-
}
|
|
4520
4499
|
var SCORING_FUNCTIONS = {
|
|
4521
4500
|
schema_markup: scoreSchemaMarkup,
|
|
4522
4501
|
qa_content_format: scoreQAFormat,
|