aeorank 3.1.0 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -462,6 +462,75 @@ function detectParkedDomain(bodySnippet) {
462
462
  return { isParked: false };
463
463
  }
464
464
 
465
+ // src/duplicate-content.ts
466
+ var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
467
+ var MIN_SUBSTANTIVE_WORDS = 15;
468
+ var MAX_METADATA_WORDS = 24;
469
+ var MAX_METADATA_LABEL_WORDS = 4;
470
+ function normalizeParagraphText(htmlFragment) {
471
+ return htmlFragment.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
472
+ }
473
+ function tokenize(text) {
474
+ return text.split(/\s+/).map((word) => word.replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, "")).filter((word) => word.length > 0);
475
+ }
476
+ function isBoilerplateParagraph(text, words) {
477
+ if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
478
+ if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
479
+ return false;
480
+ }
481
+ function isMetadataParagraph(text, words) {
482
+ const labelMatch = text.match(/^([^:]{1,60}):\s+/);
483
+ if (!labelMatch) return false;
484
+ const labelWords = tokenize(labelMatch[1]).length;
485
+ return labelWords > 0 && labelWords <= MAX_METADATA_LABEL_WORDS && words <= MAX_METADATA_WORDS;
486
+ }
487
+ function buildShinglesFromTokens(words, n = 4) {
488
+ const shingles = /* @__PURE__ */ new Set();
489
+ for (let i = 0; i <= words.length - n; i++) {
490
+ shingles.add(words.slice(i, i + n).join(" "));
491
+ }
492
+ return shingles;
493
+ }
494
+ function createParagraph(htmlFragment) {
495
+ const text = normalizeParagraphText(htmlFragment);
496
+ const words = tokenize(text);
497
+ if (words.length < MIN_SUBSTANTIVE_WORDS) return null;
498
+ if (isBoilerplateParagraph(text, words.length)) return null;
499
+ if (isMetadataParagraph(text, words.length)) return null;
500
+ const shingles = buildShinglesFromTokens(words);
501
+ if (shingles.size < 3) return null;
502
+ return { text, shingles };
503
+ }
504
+ function stripNonContentHtml(html) {
505
+ return html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
506
+ }
507
+ function extractDuplicateContentParagraphs(html) {
508
+ const cleaned = stripNonContentHtml(html);
509
+ const matches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
510
+ return matches.map(createParagraph).filter((paragraph) => paragraph !== null);
511
+ }
512
+ function extractDuplicateContentSections(html) {
513
+ const cleaned = stripNonContentHtml(html);
514
+ const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
515
+ const sections = [];
516
+ for (const part of parts) {
517
+ const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
518
+ const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
519
+ const paragraphs = (part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []).map(createParagraph).filter((paragraph) => paragraph !== null);
520
+ if (paragraphs.length > 0) sections.push({ heading, paragraphs });
521
+ }
522
+ return sections;
523
+ }
524
+ function shingleJaccardSimilarity(a, b) {
525
+ if (a.size === 0 && b.size === 0) return 0;
526
+ let intersection = 0;
527
+ for (const shingle of a) {
528
+ if (b.has(shingle)) intersection++;
529
+ }
530
+ const union = a.size + b.size - intersection;
531
+ return union === 0 ? 0 : intersection / union;
532
+ }
533
+
465
534
  // src/site-crawler.ts
466
535
  async function fetchText(url) {
467
536
  try {
@@ -2942,56 +3011,8 @@ function checkImageContextAI(data) {
2942
3011
  }
2943
3012
  return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
2944
3013
  }
2945
- var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
2946
- function isBoilerplateParagraph(text) {
2947
- const words = text.split(/\s+/).length;
2948
- if (words < 20 && BOILERPLATE_RE.test(text)) return true;
2949
- if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
2950
- return false;
2951
- }
2952
- function toShingles(text, n = 4) {
2953
- const words = text.split(/\s+/).filter((w) => w.length > 1);
2954
- const shingles = /* @__PURE__ */ new Set();
2955
- for (let i = 0; i <= words.length - n; i++) {
2956
- shingles.add(words.slice(i, i + n).join(" "));
2957
- }
2958
- return shingles;
2959
- }
2960
- function shingleSimilarity(a, b) {
2961
- if (a.size === 0 && b.size === 0) return 0;
2962
- let intersection = 0;
2963
- for (const s of a) {
2964
- if (b.has(s)) intersection++;
2965
- }
2966
- const union = a.size + b.size - intersection;
2967
- return union === 0 ? 0 : intersection / union;
2968
- }
2969
- function extractPageParagraphs(html) {
2970
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
2971
- const pMatches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
2972
- return pMatches.map((p) => {
2973
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
2974
- return { text, shingles: toShingles(text) };
2975
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
2976
- }
2977
- function splitIntoSectionsWithParagraphs(html) {
2978
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
2979
- const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
2980
- const sections = [];
2981
- for (const part of parts) {
2982
- const hMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
2983
- const heading = hMatch ? hMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
2984
- const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
2985
- const paragraphs = pMatches.map((p) => {
2986
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
2987
- return { text, shingles: toShingles(text) };
2988
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
2989
- if (paragraphs.length > 0) sections.push({ heading, paragraphs });
2990
- }
2991
- return sections;
2992
- }
2993
3014
  function findIntraPageDuplicates(html) {
2994
- const sections = splitIntoSectionsWithParagraphs(html);
3015
+ const sections = extractDuplicateContentSections(html);
2995
3016
  if (sections.length < 2) return [];
2996
3017
  const pairs = [];
2997
3018
  for (let i = 0; i < sections.length; i++) {
@@ -3000,7 +3021,7 @@ function findIntraPageDuplicates(html) {
3000
3021
  for (const pA of sections[i].paragraphs) {
3001
3022
  if (found) break;
3002
3023
  for (const pB of sections[j].paragraphs) {
3003
- const sim = shingleSimilarity(pA.shingles, pB.shingles);
3024
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
3004
3025
  if (sim > 0.4) {
3005
3026
  pairs.push({
3006
3027
  headingA: sections[i].heading,
@@ -3080,11 +3101,11 @@ function checkCrossPageDuplication(data) {
3080
3101
  const findings = [];
3081
3102
  const pages = [];
3082
3103
  if (data.homepage) {
3083
- pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractPageParagraphs(data.homepage.text) });
3104
+ pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
3084
3105
  }
3085
3106
  if (data.blogSample) {
3086
3107
  for (const page of data.blogSample) {
3087
- pages.push({ url: page.finalUrl || "", paragraphs: extractPageParagraphs(page.text) });
3108
+ pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
3088
3109
  }
3089
3110
  }
3090
3111
  if (pages.length <= 1) {
@@ -3116,7 +3137,7 @@ function checkCrossPageDuplication(data) {
3116
3137
  const fpA = [...pA.shingles].slice(0, 5).join("|");
3117
3138
  if (siteBoilerprints.has(fpA)) continue;
3118
3139
  for (const pB of pages[j].paragraphs) {
3119
- const sim = shingleSimilarity(pA.shingles, pB.shingles);
3140
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
3120
3141
  if (sim > 0.4) {
3121
3142
  dupCount++;
3122
3143
  if (!sample) sample = pA.text.slice(0, 80);
@@ -4842,18 +4863,11 @@ function scoreImageContextAI(html) {
4842
4863
  if (contextualImages.length > 0) score += 3;
4843
4864
  return cap(score, 10);
4844
4865
  }
4845
- var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
4846
- function isBoilerplate(text) {
4847
- const words = text.split(/\s+/).length;
4848
- if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
4849
- if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
4850
- return false;
4851
- }
4852
4866
  function scoreDuplicateContent(html) {
4853
4867
  return scoreDuplicateContentDetailed(html).score;
4854
4868
  }
4855
4869
  function scoreDuplicateContentDetailed(html) {
4856
- const sections = extractSectionsWithParagraphs(html);
4870
+ const sections = extractDuplicateContentSections(html);
4857
4871
  if (sections.length < 2) return { score: 10, duplicates: [] };
4858
4872
  const totalParagraphs = sections.reduce((sum, s) => sum + s.paragraphs.length, 0);
4859
4873
  const duplicates = [];
@@ -4862,7 +4876,7 @@ function scoreDuplicateContentDetailed(html) {
4862
4876
  for (let j = i + 1; j < sections.length; j++) {
4863
4877
  for (const pA of sections[i].paragraphs) {
4864
4878
  for (const pB of sections[j].paragraphs) {
4865
- const sim = shingleJaccard(pA.shingles, pB.shingles);
4879
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
4866
4880
  if (sim > 0.4) {
4867
4881
  dupParagraphCount++;
4868
4882
  duplicates.push({
@@ -4891,41 +4905,6 @@ function scoreDuplicateContentDetailed(html) {
4891
4905
  }
4892
4906
  return { score, duplicates };
4893
4907
  }
4894
- function extractSectionsWithParagraphs(html) {
4895
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
4896
- const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
4897
- const sections = [];
4898
- for (const part of parts) {
4899
- const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
4900
- const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
4901
- const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
4902
- const paragraphs = pMatches.map((p) => {
4903
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
4904
- return { text, shingles: buildShingles(text, 4) };
4905
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplate(p.text));
4906
- if (paragraphs.length > 0) {
4907
- sections.push({ heading, paragraphs });
4908
- }
4909
- }
4910
- return sections;
4911
- }
4912
- function buildShingles(text, n) {
4913
- const words = text.split(/\s+/).filter((w) => w.length > 1);
4914
- const shingles = /* @__PURE__ */ new Set();
4915
- for (let i = 0; i <= words.length - n; i++) {
4916
- shingles.add(words.slice(i, i + n).join(" "));
4917
- }
4918
- return shingles;
4919
- }
4920
- function shingleJaccard(a, b) {
4921
- if (a.size === 0 && b.size === 0) return 0;
4922
- let intersection = 0;
4923
- for (const s of a) {
4924
- if (b.has(s)) intersection++;
4925
- }
4926
- const union = a.size + b.size - intersection;
4927
- return union === 0 ? 0 : intersection / union;
4928
- }
4929
4908
  var SCORING_FUNCTIONS = {
4930
4909
  schema_markup: scoreSchemaMarkup,
4931
4910
  qa_content_format: scoreQAFormat,