aeorank 3.1.0 → 3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/browser.js +76 -97
- package/dist/browser.js.map +1 -1
- package/dist/cli.js +76 -97
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +76 -97
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +76 -97
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -462,6 +462,75 @@ function detectParkedDomain(bodySnippet) {
|
|
|
462
462
|
return { isParked: false };
|
|
463
463
|
}
|
|
464
464
|
|
|
465
|
+
// src/duplicate-content.ts
|
|
466
|
+
// Call-to-action and legal/footer phrases, matched case-insensitively on word
// boundaries. isBoilerplateParagraph tests paragraphs under 20 words against it.
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
|
|
467
|
+
// createParagraph discards any paragraph that tokenizes to fewer words than this.
var MIN_SUBSTANTIVE_WORDS = 15;
|
|
468
|
+
// Largest total word count at which isMetadataParagraph still treats a
// "label: value" paragraph as metadata.
var MAX_METADATA_WORDS = 24;
|
|
469
|
+
// Largest number of words allowed before the first colon for isMetadataParagraph
// to treat that prefix as a label.
var MAX_METADATA_LABEL_WORDS = 4;
|
|
470
|
+
/**
 * Reduce an HTML fragment to lowercase plain text suitable for comparison.
 *
 * Tags and character references are each replaced by a single space, runs of
 * whitespace are collapsed to one space, and the result is trimmed and
 * lower-cased.
 *
 * @param {string} htmlFragment - Raw HTML, typically one `<p>…</p>` element.
 * @returns {string} The normalized text.
 */
function normalizeParagraphText(htmlFragment) {
  return htmlFragment
    .replace(/<[^>]*>/g, " ")
    // Covers named (&amp;), decimal (&#39;) and hexadecimal (&#x2014;)
    // references. A plain /&\w+;/ misses the numeric forms because "#" is not
    // a word character, which left raw "&#39;" fragments inside the text.
    .replace(/&(?:#\d+|#x[0-9a-f]+|\w+);/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
}
|
|
473
|
+
/**
 * Split text into word tokens.
 *
 * The text is split on whitespace; each piece has leading and trailing
 * characters that are not letters, combining marks or digits removed, and
 * pieces that end up empty are dropped. Characters inside a token are kept
 * as-is (for example the apostrophe in "don't").
 *
 * Stripping is Unicode-aware: an ASCII-only class such as [^a-z0-9] would
 * truncate "café" to "caf" and erase non-Latin words completely, so
 * non-English paragraphs would never reach the minimum word count.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize(text) {
  return text
    .split(/\s+/)
    .map((word) => word.replace(/^[^\p{L}\p{M}\p{N}]+|[^\p{L}\p{M}\p{N}]+$/gu, ""))
    .filter((word) => word.length > 0);
}
|
|
476
|
+
/**
 * Decide whether a paragraph is site boilerplate rather than real content.
 *
 * A paragraph counts as boilerplate when it has fewer than 20 words and
 * matches BOILERPLATE_PATTERNS, or when it has fewer than 30 words and
 * mentions cookie/GDPR/consent/opt-out wording.
 *
 * @param {string} text - Normalized paragraph text.
 * @param {number} words - Word count of the paragraph.
 * @returns {boolean} True when the paragraph should be ignored.
 */
function isBoilerplateParagraph(text, words) {
  const looksLikeCallToAction = words < 20 && BOILERPLATE_PATTERNS.test(text);
  if (looksLikeCallToAction) {
    return true;
  }
  const mentionsConsent = /\b(cookie|gdpr|consent|opt.out)\b/i.test(text);
  return mentionsConsent && words < 30;
}
|
|
481
|
+
/**
 * Detect short "label: value" paragraphs.
 *
 * The text must start with 1-60 non-colon characters followed by a colon and
 * whitespace. That prefix must tokenize to between 1 and
 * MAX_METADATA_LABEL_WORDS words, and the whole paragraph must have at most
 * MAX_METADATA_WORDS words.
 *
 * @param {string} text - Normalized paragraph text.
 * @param {number} words - Word count of the paragraph.
 * @returns {boolean} True when the paragraph looks like a metadata line.
 */
function isMetadataParagraph(text, words) {
  const [, label] = text.match(/^([^:]{1,60}):\s+/) || [];
  if (label === undefined) {
    return false;
  }
  const labelSize = tokenize(label).length;
  if (labelSize === 0 || labelSize > MAX_METADATA_LABEL_WORDS) {
    return false;
  }
  return words <= MAX_METADATA_WORDS;
}
|
|
487
|
+
/**
 * Build the set of n-word shingles (overlapping word windows) for a token list.
 *
 * @param {string[]} words - Tokens in document order.
 * @param {number} [n=4] - Number of words per shingle.
 * @returns {Set<string>} Space-joined shingles in first-occurrence order;
 *   empty when there are fewer than n words.
 */
function buildShinglesFromTokens(words, n = 4) {
  // One window starts at every index from 0 to words.length - n inclusive.
  const windowCount = Math.max(0, words.length - n + 1);
  const grams = Array.from({ length: windowCount }, (_, start) => words.slice(start, start + n).join(" "));
  return new Set(grams);
}
|
|
494
|
+
/**
 * Turn one HTML paragraph fragment into a comparable paragraph record.
 *
 * Returns null when the paragraph has fewer than MIN_SUBSTANTIVE_WORDS words,
 * is classified as boilerplate or metadata, or yields fewer than three
 * distinct shingles.
 *
 * @param {string} htmlFragment - Raw HTML of a single paragraph.
 * @returns {{text: string, shingles: Set<string>} | null} The normalized text
 *   with its shingle set, or null when the paragraph is filtered out.
 */
function createParagraph(htmlFragment) {
  const text = normalizeParagraphText(htmlFragment);
  const tokens = tokenize(text);
  const wordCount = tokens.length;

  // Checks run cheapest-first and short-circuit in the same order as before.
  const isRejected =
    wordCount < MIN_SUBSTANTIVE_WORDS ||
    isBoilerplateParagraph(text, wordCount) ||
    isMetadataParagraph(text, wordCount);
  if (isRejected) {
    return null;
  }

  const shingles = buildShinglesFromTokens(tokens);
  return shingles.size >= 3 ? { text, shingles } : null;
}
|
|
504
|
+
/**
 * Remove page chrome and non-content elements from an HTML string.
 *
 * First drops script, style, nav, header, footer and noscript elements (with
 * their contents), then drops aside elements. Matching is case-insensitive
 * and non-greedy, ending at the first matching closing tag.
 *
 * @param {string} html - Raw page HTML.
 * @returns {string} The HTML with those elements removed.
 */
function stripNonContentHtml(html) {
  const chromeBlocks = /<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi;
  const asideBlocks = /<aside\b[^>]*>[\s\S]*?<\/aside>/gi;
  const withoutChrome = html.replace(chromeBlocks, "");
  return withoutChrome.replace(asideBlocks, "");
}
|
|
507
|
+
/**
 * Collect every comparable paragraph on a page.
 *
 * Non-content elements are stripped first; each remaining `<p>` element is
 * passed to createParagraph and the ones it rejects (null) are dropped.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} Paragraph records in
 *   document order.
 */
function extractDuplicateContentParagraphs(html) {
  const paragraphTags = stripNonContentHtml(html).match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi);
  const kept = [];
  for (const tag of paragraphTags || []) {
    const paragraph = createParagraph(tag);
    if (paragraph !== null) {
      kept.push(paragraph);
    }
  }
  return kept;
}
|
|
512
|
+
/**
 * Split a page into heading-delimited sections of comparable paragraphs.
 *
 * The cleaned HTML is cut in front of every `<h2>`/`<h3>` opening tag. Each
 * chunk is labelled with its heading text (tags removed, trimmed), or
 * "(intro)" when it has no heading. Chunks with no surviving paragraphs are
 * omitted.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in document order.
 */
function extractDuplicateContentSections(html) {
  const chunks = stripNonContentHtml(html).split(/(?=<h[23]\b[^>]*>)/i);
  return chunks
    .map((chunk) => {
      const titleMatch = chunk.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
      let heading = "(intro)";
      if (titleMatch) {
        heading = titleMatch[1].replace(/<[^>]*>/g, "").trim();
      }

      const paragraphs = [];
      for (const tag of chunk.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []) {
        const paragraph = createParagraph(tag);
        if (paragraph !== null) {
          paragraphs.push(paragraph);
        }
      }
      return { heading, paragraphs };
    })
    .filter((section) => section.paragraphs.length > 0);
}
|
|
524
|
+
/**
 * Jaccard similarity of two shingle sets: |a ∩ b| / |a ∪ b|.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} A value in [0, 1]; 0 when both sets are empty.
 */
function shingleJaccardSimilarity(a, b) {
  if (a.size === 0 && b.size === 0) {
    return 0;
  }
  const shared = [...a].filter((shingle) => b.has(shingle)).length;
  const combined = a.size + b.size - shared;
  if (combined === 0) {
    return 0;
  }
  return shared / combined;
}
|
|
533
|
+
|
|
465
534
|
// src/site-crawler.ts
|
|
466
535
|
async function fetchText(url) {
|
|
467
536
|
try {
|
|
@@ -2942,56 +3011,8 @@ function checkImageContextAI(data) {
|
|
|
2942
3011
|
}
|
|
2943
3012
|
return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
|
|
2944
3013
|
}
|
|
2945
|
-
// Call-to-action and legal/footer phrases used to recognise short boilerplate paragraphs.
var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;

/**
 * Decide whether a paragraph is site boilerplate.
 *
 * Words are counted by splitting on whitespace. A paragraph is boilerplate
 * when it has fewer than 20 words and matches BOILERPLATE_RE, or has fewer
 * than 30 words and mentions cookie/GDPR/consent/opt-out wording.
 *
 * @param {string} text - Normalized paragraph text.
 * @returns {boolean} True when the paragraph should be ignored.
 */
function isBoilerplateParagraph(text) {
  const wordCount = text.split(/\s+/).length;
  // Neither rule can apply at 30 words or more.
  if (wordCount >= 30) {
    return false;
  }
  if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text)) {
    return true;
  }
  return wordCount < 20 && BOILERPLATE_RE.test(text);
}
|
|
2952
|
-
/**
 * Build the set of n-word shingles for a text.
 *
 * The text is split on whitespace and single-character pieces are discarded
 * before the overlapping word windows are formed.
 *
 * @param {string} text - Normalized paragraph text.
 * @param {number} [n=4] - Number of words per shingle.
 * @returns {Set<string>} Space-joined shingles in first-occurrence order.
 */
function toShingles(text, n = 4) {
  const tokens = text.split(/\s+/).filter((token) => token.length > 1);
  const grams = new Set();
  const lastStart = tokens.length - n;
  let start = 0;
  while (start <= lastStart) {
    grams.add(tokens.slice(start, start + n).join(" "));
    start += 1;
  }
  return grams;
}
|
|
2960
|
-
/**
 * Jaccard similarity of two shingle sets.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} Intersection size divided by union size; 0 when both
 *   sets are empty.
 */
function shingleSimilarity(a, b) {
  const bothEmpty = a.size === 0 && b.size === 0;
  if (bothEmpty) {
    return 0;
  }
  let overlap = 0;
  a.forEach((item) => {
    if (b.has(item)) {
      overlap += 1;
    }
  });
  const unionSize = a.size + b.size - overlap;
  return unionSize !== 0 ? overlap / unionSize : 0;
}
|
|
2969
|
-
/**
 * Collect the comparable paragraphs of a page.
 *
 * Script/style/nav/header/footer/noscript and aside elements are removed.
 * Each remaining `<p>` element is reduced to lowercase plain text and turned
 * into shingles. Paragraphs with fewer than three shingles, or classified as
 * boilerplate, are dropped.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} Paragraph records in
 *   document order.
 */
function extractPageParagraphs(html) {
  const withoutChrome = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "");
  const body = withoutChrome.replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");

  const result = [];
  for (const tag of body.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []) {
    const text = tag
      .replace(/<[^>]*>/g, " ")
      .replace(/&\w+;/g, " ")
      .replace(/\s+/g, " ")
      .trim()
      .toLowerCase();
    const shingles = toShingles(text);
    if (shingles.size < 3 || isBoilerplateParagraph(text)) {
      continue;
    }
    result.push({ text, shingles });
  }
  return result;
}
|
|
2977
|
-
/**
 * Split a page into heading-delimited sections of comparable paragraphs.
 *
 * After non-content elements are removed, the HTML is cut in front of every
 * `<h2>`/`<h3>` opening tag. Each chunk is labelled with its heading text, or
 * "(intro)" when it has none. Paragraphs with fewer than three shingles or
 * classified as boilerplate are dropped, and chunks left with no paragraphs
 * are omitted.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in document order.
 */
function splitIntoSectionsWithParagraphs(html) {
  const stripped = html
    .replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "")
    .replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");

  // Reduce one <p> element to normalized text plus its shingle set.
  const toParagraph = (tag) => {
    const text = tag
      .replace(/<[^>]*>/g, " ")
      .replace(/&\w+;/g, " ")
      .replace(/\s+/g, " ")
      .trim()
      .toLowerCase();
    return { text, shingles: toShingles(text) };
  };
  const isSubstantive = (paragraph) => paragraph.shingles.size >= 3 && !isBoilerplateParagraph(paragraph.text);

  return stripped
    .split(/(?=<h[23]\b[^>]*>)/i)
    .map((chunk) => {
      const titleMatch = chunk.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
      const heading = titleMatch === null ? "(intro)" : titleMatch[1].replace(/<[^>]*>/g, "").trim();
      const tags = chunk.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
      return { heading, paragraphs: tags.map(toParagraph).filter(isSubstantive) };
    })
    .filter((section) => section.paragraphs.length > 0);
}
|
|
2993
3014
|
function findIntraPageDuplicates(html) {
|
|
2994
|
-
const sections =
|
|
3015
|
+
const sections = extractDuplicateContentSections(html);
|
|
2995
3016
|
if (sections.length < 2) return [];
|
|
2996
3017
|
const pairs = [];
|
|
2997
3018
|
for (let i = 0; i < sections.length; i++) {
|
|
@@ -3000,7 +3021,7 @@ function findIntraPageDuplicates(html) {
|
|
|
3000
3021
|
for (const pA of sections[i].paragraphs) {
|
|
3001
3022
|
if (found) break;
|
|
3002
3023
|
for (const pB of sections[j].paragraphs) {
|
|
3003
|
-
const sim =
|
|
3024
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
3004
3025
|
if (sim > 0.4) {
|
|
3005
3026
|
pairs.push({
|
|
3006
3027
|
headingA: sections[i].heading,
|
|
@@ -3080,11 +3101,11 @@ function checkCrossPageDuplication(data) {
|
|
|
3080
3101
|
const findings = [];
|
|
3081
3102
|
const pages = [];
|
|
3082
3103
|
if (data.homepage) {
|
|
3083
|
-
pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs:
|
|
3104
|
+
pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
|
|
3084
3105
|
}
|
|
3085
3106
|
if (data.blogSample) {
|
|
3086
3107
|
for (const page of data.blogSample) {
|
|
3087
|
-
pages.push({ url: page.finalUrl || "", paragraphs:
|
|
3108
|
+
pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
|
|
3088
3109
|
}
|
|
3089
3110
|
}
|
|
3090
3111
|
if (pages.length <= 1) {
|
|
@@ -3116,7 +3137,7 @@ function checkCrossPageDuplication(data) {
|
|
|
3116
3137
|
const fpA = [...pA.shingles].slice(0, 5).join("|");
|
|
3117
3138
|
if (siteBoilerprints.has(fpA)) continue;
|
|
3118
3139
|
for (const pB of pages[j].paragraphs) {
|
|
3119
|
-
const sim =
|
|
3140
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
3120
3141
|
if (sim > 0.4) {
|
|
3121
3142
|
dupCount++;
|
|
3122
3143
|
if (!sample) sample = pA.text.slice(0, 80);
|
|
@@ -4842,18 +4863,11 @@ function scoreImageContextAI(html) {
|
|
|
4842
4863
|
if (contextualImages.length > 0) score += 3;
|
|
4843
4864
|
return cap(score, 10);
|
|
4844
4865
|
}
|
|
4845
|
-
// Call-to-action and legal/footer phrases used to recognise short boilerplate paragraphs.
var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;

/**
 * Decide whether a paragraph is site boilerplate.
 *
 * Words are counted by splitting on whitespace. Short pitches (under 20 words
 * and matching BOILERPLATE_PATTERNS) and short consent notices (under 30 words
 * and mentioning cookie/GDPR/consent/opt-out) both count as boilerplate.
 *
 * @param {string} text - Normalized paragraph text.
 * @returns {boolean} True when the paragraph should be ignored.
 */
function isBoilerplate(text) {
  const wordCount = text.split(/\s+/).length;
  const isShortPitch = wordCount < 20 && BOILERPLATE_PATTERNS.test(text);
  const isConsentNotice = wordCount < 30 && /\b(cookie|gdpr|consent|opt.out)\b/i.test(text);
  return isShortPitch || isConsentNotice;
}
|
|
4852
4866
|
/**
 * Score a page for intra-page duplicate content.
 *
 * Convenience wrapper that returns only the `score` field of
 * scoreDuplicateContentDetailed(html).
 *
 * @param {string} html - Raw page HTML.
 * @returns {number} The duplicate-content score.
 */
function scoreDuplicateContent(html) {
  const { score } = scoreDuplicateContentDetailed(html);
  return score;
}
|
|
4855
4869
|
function scoreDuplicateContentDetailed(html) {
|
|
4856
|
-
const sections =
|
|
4870
|
+
const sections = extractDuplicateContentSections(html);
|
|
4857
4871
|
if (sections.length < 2) return { score: 10, duplicates: [] };
|
|
4858
4872
|
const totalParagraphs = sections.reduce((sum, s) => sum + s.paragraphs.length, 0);
|
|
4859
4873
|
const duplicates = [];
|
|
@@ -4862,7 +4876,7 @@ function scoreDuplicateContentDetailed(html) {
|
|
|
4862
4876
|
for (let j = i + 1; j < sections.length; j++) {
|
|
4863
4877
|
for (const pA of sections[i].paragraphs) {
|
|
4864
4878
|
for (const pB of sections[j].paragraphs) {
|
|
4865
|
-
const sim =
|
|
4879
|
+
const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
|
|
4866
4880
|
if (sim > 0.4) {
|
|
4867
4881
|
dupParagraphCount++;
|
|
4868
4882
|
duplicates.push({
|
|
@@ -4891,41 +4905,6 @@ function scoreDuplicateContentDetailed(html) {
|
|
|
4891
4905
|
}
|
|
4892
4906
|
return { score, duplicates };
|
|
4893
4907
|
}
|
|
4894
|
-
/**
 * Split a page into heading-delimited sections of comparable paragraphs.
 *
 * Non-content elements are removed and the HTML is cut in front of every
 * `<h2>`/`<h3>` opening tag. Each `<p>` element is normalized to lowercase
 * plain text and turned into 4-word shingles; paragraphs with fewer than
 * three shingles or classified as boilerplate are dropped. Chunks with no
 * surviving paragraphs are skipped; the rest are labelled with their heading
 * text, or "(intro)" when they have none.
 *
 * @param {string} html - Raw page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in document order.
 */
function extractSectionsWithParagraphs(html) {
  const noChrome = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "");
  const content = noChrome.replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");

  const sections = [];
  content.split(/(?=<h[23]\b[^>]*>)/i).forEach((chunk) => {
    const paragraphs = [];
    for (const tag of chunk.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []) {
      const text = tag
        .replace(/<[^>]*>/g, " ")
        .replace(/&\w+;/g, " ")
        .replace(/\s+/g, " ")
        .trim()
        .toLowerCase();
      const shingles = buildShingles(text, 4);
      if (shingles.size >= 3 && !isBoilerplate(text)) {
        paragraphs.push({ text, shingles });
      }
    }
    if (paragraphs.length === 0) {
      return;
    }

    const titleMatch = chunk.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
    const heading = titleMatch ? titleMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
    sections.push({ heading, paragraphs });
  });
  return sections;
}
|
|
4912
|
-
/**
 * Build the set of n-word shingles for a text.
 *
 * The text is split on whitespace and single-character pieces are discarded
 * before the overlapping word windows are formed.
 *
 * @param {string} text - Normalized paragraph text.
 * @param {number} n - Number of words per shingle.
 * @returns {Set<string>} Space-joined shingles in first-occurrence order.
 */
function buildShingles(text, n) {
  const tokens = text.split(/\s+/).filter((token) => token.length > 1);
  // One window starts at every index from 0 to tokens.length - n inclusive.
  const windowCount = Math.max(0, tokens.length - n + 1);
  return new Set(Array.from({ length: windowCount }, (_, start) => tokens.slice(start, start + n).join(" ")));
}
|
|
4920
|
-
/**
 * Jaccard similarity of two shingle sets.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} Intersection size divided by union size; 0 when both
 *   sets are empty.
 */
function shingleJaccard(a, b) {
  if (a.size === 0 && b.size === 0) {
    return 0;
  }
  const common = Array.from(a).reduce((count, item) => count + (b.has(item) ? 1 : 0), 0);
  const total = a.size + b.size - common;
  return total === 0 ? 0 : common / total;
}
|
|
4929
4908
|
var SCORING_FUNCTIONS = {
|
|
4930
4909
|
schema_markup: scoreSchemaMarkup,
|
|
4931
4910
|
qa_content_format: scoreQAFormat,
|