aeorank 3.1.0 → 3.1.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -78,6 +78,75 @@ function detectParkedDomain(bodySnippet) {
78
78
  return { isParked: false };
79
79
  }
80
80
 
81
+ // src/duplicate-content.ts
82
+ var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i; // call-to-action and legal phrases; isBoilerplateParagraph checks paragraphs under 20 words against this
83
+ var MIN_SUBSTANTIVE_WORDS = 15; // createParagraph returns null for paragraphs with fewer tokens than this
84
+ var MAX_METADATA_WORDS = 24; // isMetadataParagraph: largest word count still treated as a "Label: value" line
85
+ var MAX_METADATA_LABEL_WORDS = 4; // isMetadataParagraph: largest number of tokens allowed in the label before the colon
86
/**
 * Reduce an HTML fragment to lowercase plain text suitable for comparison.
 *
 * Tags and character references are each replaced by a single space,
 * whitespace runs are collapsed to one space, and the result is trimmed.
 *
 * @param {string} htmlFragment - Raw HTML, e.g. a complete `<p>…</p>` match.
 * @returns {string} Lowercased text with single-space separators.
 */
function normalizeParagraphText(htmlFragment) {
  return htmlFragment
    .replace(/<[^>]*>/g, " ")
    // Drop named (&amp;), decimal (&#39;) and hex (&#x27;) references alike.
    // A bare `&\w+;` misses the numeric forms because `#` is not a word
    // character, which left strings such as "don&#39;t" in the output.
    .replace(/&(?:#\d+|#x[0-9a-f]+|\w+);/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();
}
89
/**
 * Split text on whitespace and strip punctuation from both ends of each token.
 *
 * Edge characters are removed unless they are Unicode letters, combining
 * marks or numbers, so "café," becomes "café" and Cyrillic or CJK words are
 * kept intact. An ASCII-only class (`[^a-z0-9]`) would truncate accented
 * words and reduce non-Latin words to empty strings, making every such
 * paragraph look like it has zero words.
 *
 * Characters inside a token (e.g. the apostrophe in "don't") are preserved.
 * Letter case is not changed.
 *
 * @param {string} text - Plain text to split.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize(text) {
  return text
    .split(/\s+/)
    .map((word) => word.replace(/^[^\p{L}\p{M}\p{N}]+|[^\p{L}\p{M}\p{N}]+$/gu, ""))
    .filter((word) => word.length > 0);
}
92
/**
 * Decide whether a short paragraph is call-to-action, legal or consent
 * boilerplate rather than substantive content.
 *
 * @param {string} text - Normalized paragraph text.
 * @param {number} words - Word count of the paragraph.
 * @returns {boolean} True for paragraphs under 20 words that match
 *   BOILERPLATE_PATTERNS, or under 30 words that mention cookies, GDPR,
 *   consent or opting out.
 */
function isBoilerplateParagraph(text, words) {
  const consentNotice = /\b(cookie|gdpr|consent|opt.out)\b/i;
  if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
  // Consent banners tend to run longer than CTAs, hence the wider limit.
  return words < 30 && consentNotice.test(text);
}
97
/**
 * Detect short "Label: value" lines.
 *
 * A paragraph qualifies when it starts with a colon-terminated label of
 * 1–60 non-colon characters followed by whitespace, the label tokenizes to
 * between 1 and MAX_METADATA_LABEL_WORDS words, and the whole paragraph has
 * at most MAX_METADATA_WORDS words.
 *
 * @param {string} text - Normalized paragraph text.
 * @param {number} words - Word count of the paragraph.
 * @returns {boolean} True when the paragraph looks like a metadata line.
 */
function isMetadataParagraph(text, words) {
  const label = /^([^:]{1,60}):\s+/.exec(text);
  if (label === null) return false;

  const labelWordCount = tokenize(label[1]).length;
  if (labelWordCount === 0 || labelWordCount > MAX_METADATA_LABEL_WORDS) {
    return false;
  }
  return words <= MAX_METADATA_WORDS;
}
103
/**
 * Build the set of n-word shingles (overlapping word windows) for a token list.
 *
 * @param {string[]} words - Tokens in document order.
 * @param {number} [n=4] - Number of tokens per shingle.
 * @returns {Set<string>} Distinct shingles, each joined with single spaces;
 *   empty when there are fewer than `n` tokens.
 */
function buildShinglesFromTokens(words, n = 4) {
  const lastStart = words.length - n;
  const shingles = new Set();
  let start = 0;
  while (start <= lastStart) {
    const gram = words.slice(start, start + n);
    shingles.add(gram.join(" "));
    start += 1;
  }
  return shingles;
}
110
/**
 * Turn one HTML paragraph fragment into a comparable record, or reject it.
 *
 * A fragment is rejected (null) when it has fewer than
 * MIN_SUBSTANTIVE_WORDS tokens, is boilerplate, is a metadata line, or
 * yields fewer than three distinct shingles.
 *
 * @param {string} htmlFragment - Raw HTML of a single paragraph.
 * @returns {{text: string, shingles: Set<string>} | null} Normalized text
 *   with its shingle set, or null when the paragraph should be ignored.
 */
function createParagraph(htmlFragment) {
  const text = normalizeParagraphText(htmlFragment);
  const tokens = tokenize(text);
  const tokenCount = tokens.length;

  // Cheapest check first; the later predicates only see long-enough paragraphs.
  const isRejected =
    tokenCount < MIN_SUBSTANTIVE_WORDS ||
    isBoilerplateParagraph(text, tokenCount) ||
    isMetadataParagraph(text, tokenCount);
  if (isRejected) return null;

  const shingles = buildShinglesFromTokens(tokens);
  return shingles.size >= 3 ? { text, shingles } : null;
}
120
/**
 * Remove page chrome so only body content remains.
 *
 * Deletes every script, style, nav, header, footer and noscript element
 * together with its contents, then does the same for aside elements in a
 * second pass.
 *
 * @param {string} html - Full page HTML.
 * @returns {string} The HTML with those elements removed.
 */
function stripNonContentHtml(html) {
  const pairedChrome = /<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi;
  const asideBlock = /<aside\b[^>]*>[\s\S]*?<\/aside>/gi;

  const withoutChrome = html.replace(pairedChrome, "");
  return withoutChrome.replace(asideBlock, "");
}
123
/**
 * Collect every substantive paragraph of a page for duplicate comparison.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{text: string, shingles: Set<string>}>} Records for the
 *   `<p>` elements that survive createParagraph's filters, in page order.
 */
function extractDuplicateContentParagraphs(html) {
  const paragraphTag = /<p\b[^>]*>([\s\S]*?)<\/p>/gi;
  const fragments = stripNonContentHtml(html).match(paragraphTag) || [];

  const paragraphs = [];
  for (const fragment of fragments) {
    const paragraph = createParagraph(fragment);
    if (paragraph !== null) paragraphs.push(paragraph);
  }
  return paragraphs;
}
128
/**
 * Split a page into h2/h3-delimited sections and keep each section's
 * substantive paragraphs.
 *
 * Content before the first h2/h3 is labelled "(intro)". Sections with no
 * surviving paragraphs are omitted.
 *
 * @param {string} html - Full page HTML.
 * @returns {Array<{heading: string, paragraphs: Array<{text: string, shingles: Set<string>}>}>}
 *   Sections in page order.
 */
function extractDuplicateContentSections(html) {
  const toSection = (part) => {
    const headingMatch = /<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i.exec(part);
    let heading = "(intro)";
    if (headingMatch) {
      heading = headingMatch[1].replace(/<[^>]*>/g, "").trim();
    }

    const fragments = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
    const paragraphs = [];
    for (const fragment of fragments) {
      const paragraph = createParagraph(fragment);
      if (paragraph !== null) paragraphs.push(paragraph);
    }
    return { heading, paragraphs };
  };

  // The lookahead split keeps each heading tag at the start of its own part.
  return stripNonContentHtml(html)
    .split(/(?=<h[23]\b[^>]*>)/i)
    .map(toSection)
    .filter((section) => section.paragraphs.length > 0);
}
140
/**
 * Jaccard similarity of two shingle sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<string>} a - First shingle set.
 * @param {Set<string>} b - Second shingle set.
 * @returns {number} A value from 0 to 1; 0 when both sets are empty.
 */
function shingleJaccardSimilarity(a, b) {
  if (a.size === 0 && b.size === 0) return 0;

  const shared = [...a].filter((shingle) => b.has(shingle)).length;
  const union = a.size + b.size - shared;
  if (union === 0) return 0;
  return shared / union;
}
149
+
81
150
  // src/site-crawler.ts
82
151
  async function fetchText(url) {
83
152
  try {
@@ -2558,56 +2627,8 @@ function checkImageContextAI(data) {
2558
2627
  }
2559
2628
  return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
2560
2629
  }
2561
- var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
2562
- function isBoilerplateParagraph(text) {
2563
- const words = text.split(/\s+/).length;
2564
- if (words < 20 && BOILERPLATE_RE.test(text)) return true;
2565
- if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
2566
- return false;
2567
- }
2568
- function toShingles(text, n = 4) {
2569
- const words = text.split(/\s+/).filter((w) => w.length > 1);
2570
- const shingles = /* @__PURE__ */ new Set();
2571
- for (let i = 0; i <= words.length - n; i++) {
2572
- shingles.add(words.slice(i, i + n).join(" "));
2573
- }
2574
- return shingles;
2575
- }
2576
- function shingleSimilarity(a, b) {
2577
- if (a.size === 0 && b.size === 0) return 0;
2578
- let intersection = 0;
2579
- for (const s of a) {
2580
- if (b.has(s)) intersection++;
2581
- }
2582
- const union = a.size + b.size - intersection;
2583
- return union === 0 ? 0 : intersection / union;
2584
- }
2585
- function extractPageParagraphs(html) {
2586
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
2587
- const pMatches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
2588
- return pMatches.map((p) => {
2589
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
2590
- return { text, shingles: toShingles(text) };
2591
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
2592
- }
2593
- function splitIntoSectionsWithParagraphs(html) {
2594
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
2595
- const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
2596
- const sections = [];
2597
- for (const part of parts) {
2598
- const hMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
2599
- const heading = hMatch ? hMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
2600
- const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
2601
- const paragraphs = pMatches.map((p) => {
2602
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
2603
- return { text, shingles: toShingles(text) };
2604
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
2605
- if (paragraphs.length > 0) sections.push({ heading, paragraphs });
2606
- }
2607
- return sections;
2608
- }
2609
2630
  function findIntraPageDuplicates(html) {
2610
- const sections = splitIntoSectionsWithParagraphs(html);
2631
+ const sections = extractDuplicateContentSections(html);
2611
2632
  if (sections.length < 2) return [];
2612
2633
  const pairs = [];
2613
2634
  for (let i = 0; i < sections.length; i++) {
@@ -2616,7 +2637,7 @@ function findIntraPageDuplicates(html) {
2616
2637
  for (const pA of sections[i].paragraphs) {
2617
2638
  if (found) break;
2618
2639
  for (const pB of sections[j].paragraphs) {
2619
- const sim = shingleSimilarity(pA.shingles, pB.shingles);
2640
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
2620
2641
  if (sim > 0.4) {
2621
2642
  pairs.push({
2622
2643
  headingA: sections[i].heading,
@@ -2696,11 +2717,11 @@ function checkCrossPageDuplication(data) {
2696
2717
  const findings = [];
2697
2718
  const pages = [];
2698
2719
  if (data.homepage) {
2699
- pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractPageParagraphs(data.homepage.text) });
2720
+ pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
2700
2721
  }
2701
2722
  if (data.blogSample) {
2702
2723
  for (const page of data.blogSample) {
2703
- pages.push({ url: page.finalUrl || "", paragraphs: extractPageParagraphs(page.text) });
2724
+ pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
2704
2725
  }
2705
2726
  }
2706
2727
  if (pages.length <= 1) {
@@ -2732,7 +2753,7 @@ function checkCrossPageDuplication(data) {
2732
2753
  const fpA = [...pA.shingles].slice(0, 5).join("|");
2733
2754
  if (siteBoilerprints.has(fpA)) continue;
2734
2755
  for (const pB of pages[j].paragraphs) {
2735
- const sim = shingleSimilarity(pA.shingles, pB.shingles);
2756
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
2736
2757
  if (sim > 0.4) {
2737
2758
  dupCount++;
2738
2759
  if (!sample) sample = pA.text.slice(0, 80);
@@ -4458,18 +4479,11 @@ function scoreImageContextAI(html) {
4458
4479
  if (contextualImages.length > 0) score += 3;
4459
4480
  return cap(score, 10);
4460
4481
  }
4461
- var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
4462
- function isBoilerplate(text) {
4463
- const words = text.split(/\s+/).length;
4464
- if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
4465
- if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
4466
- return false;
4467
- }
4468
4482
  function scoreDuplicateContent(html) {
4469
4483
  return scoreDuplicateContentDetailed(html).score;
4470
4484
  }
4471
4485
  function scoreDuplicateContentDetailed(html) {
4472
- const sections = extractSectionsWithParagraphs(html);
4486
+ const sections = extractDuplicateContentSections(html);
4473
4487
  if (sections.length < 2) return { score: 10, duplicates: [] };
4474
4488
  const totalParagraphs = sections.reduce((sum, s) => sum + s.paragraphs.length, 0);
4475
4489
  const duplicates = [];
@@ -4478,7 +4492,7 @@ function scoreDuplicateContentDetailed(html) {
4478
4492
  for (let j = i + 1; j < sections.length; j++) {
4479
4493
  for (const pA of sections[i].paragraphs) {
4480
4494
  for (const pB of sections[j].paragraphs) {
4481
- const sim = shingleJaccard(pA.shingles, pB.shingles);
4495
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
4482
4496
  if (sim > 0.4) {
4483
4497
  dupParagraphCount++;
4484
4498
  duplicates.push({
@@ -4507,41 +4521,6 @@ function scoreDuplicateContentDetailed(html) {
4507
4521
  }
4508
4522
  return { score, duplicates };
4509
4523
  }
4510
- function extractSectionsWithParagraphs(html) {
4511
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
4512
- const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
4513
- const sections = [];
4514
- for (const part of parts) {
4515
- const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
4516
- const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
4517
- const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
4518
- const paragraphs = pMatches.map((p) => {
4519
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
4520
- return { text, shingles: buildShingles(text, 4) };
4521
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplate(p.text));
4522
- if (paragraphs.length > 0) {
4523
- sections.push({ heading, paragraphs });
4524
- }
4525
- }
4526
- return sections;
4527
- }
4528
- function buildShingles(text, n) {
4529
- const words = text.split(/\s+/).filter((w) => w.length > 1);
4530
- const shingles = /* @__PURE__ */ new Set();
4531
- for (let i = 0; i <= words.length - n; i++) {
4532
- shingles.add(words.slice(i, i + n).join(" "));
4533
- }
4534
- return shingles;
4535
- }
4536
- function shingleJaccard(a, b) {
4537
- if (a.size === 0 && b.size === 0) return 0;
4538
- let intersection = 0;
4539
- for (const s of a) {
4540
- if (b.has(s)) intersection++;
4541
- }
4542
- const union = a.size + b.size - intersection;
4543
- return union === 0 ? 0 : intersection / union;
4544
- }
4545
4524
  var SCORING_FUNCTIONS = {
4546
4525
  schema_markup: scoreSchemaMarkup,
4547
4526
  qa_content_format: scoreQAFormat,