aeorank 3.1.0 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -76,6 +76,75 @@ function detectParkedDomain(bodySnippet) {
76
76
  return { isParked: false };
77
77
  }
78
78
 
79
+ // src/duplicate-content.ts
80
+ var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
81
+ var MIN_SUBSTANTIVE_WORDS = 15;
82
+ var MAX_METADATA_WORDS = 24;
83
+ var MAX_METADATA_LABEL_WORDS = 4;
84
+ function normalizeParagraphText(htmlFragment) {
85
+ return htmlFragment.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
86
+ }
87
+ function tokenize(text) {
88
+ return text.split(/\s+/).map((word) => word.replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, "")).filter((word) => word.length > 0);
89
+ }
90
+ function isBoilerplateParagraph(text, words) {
91
+ if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
92
+ if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
93
+ return false;
94
+ }
95
+ function isMetadataParagraph(text, words) {
96
+ const labelMatch = text.match(/^([^:]{1,60}):\s+/);
97
+ if (!labelMatch) return false;
98
+ const labelWords = tokenize(labelMatch[1]).length;
99
+ return labelWords > 0 && labelWords <= MAX_METADATA_LABEL_WORDS && words <= MAX_METADATA_WORDS;
100
+ }
101
+ function buildShinglesFromTokens(words, n = 4) {
102
+ const shingles = /* @__PURE__ */ new Set();
103
+ for (let i = 0; i <= words.length - n; i++) {
104
+ shingles.add(words.slice(i, i + n).join(" "));
105
+ }
106
+ return shingles;
107
+ }
108
+ function createParagraph(htmlFragment) {
109
+ const text = normalizeParagraphText(htmlFragment);
110
+ const words = tokenize(text);
111
+ if (words.length < MIN_SUBSTANTIVE_WORDS) return null;
112
+ if (isBoilerplateParagraph(text, words.length)) return null;
113
+ if (isMetadataParagraph(text, words.length)) return null;
114
+ const shingles = buildShinglesFromTokens(words);
115
+ if (shingles.size < 3) return null;
116
+ return { text, shingles };
117
+ }
118
+ function stripNonContentHtml(html) {
119
+ return html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
120
+ }
121
+ function extractDuplicateContentParagraphs(html) {
122
+ const cleaned = stripNonContentHtml(html);
123
+ const matches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
124
+ return matches.map(createParagraph).filter((paragraph) => paragraph !== null);
125
+ }
126
+ function extractDuplicateContentSections(html) {
127
+ const cleaned = stripNonContentHtml(html);
128
+ const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
129
+ const sections = [];
130
+ for (const part of parts) {
131
+ const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
132
+ const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
133
+ const paragraphs = (part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || []).map(createParagraph).filter((paragraph) => paragraph !== null);
134
+ if (paragraphs.length > 0) sections.push({ heading, paragraphs });
135
+ }
136
+ return sections;
137
+ }
138
+ function shingleJaccardSimilarity(a, b) {
139
+ if (a.size === 0 && b.size === 0) return 0;
140
+ let intersection = 0;
141
+ for (const shingle of a) {
142
+ if (b.has(shingle)) intersection++;
143
+ }
144
+ const union = a.size + b.size - intersection;
145
+ return union === 0 ? 0 : intersection / union;
146
+ }
147
+
79
148
  // src/site-crawler.ts
80
149
  async function fetchText(url) {
81
150
  try {
@@ -2556,56 +2625,8 @@ function checkImageContextAI(data) {
2556
2625
  }
2557
2626
  return { criterion: "image_context_ai", criterion_label: "Image Context for AI", score: Math.min(10, score), status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P2" };
2558
2627
  }
2559
- var BOILERPLATE_RE = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
2560
- function isBoilerplateParagraph(text) {
2561
- const words = text.split(/\s+/).length;
2562
- if (words < 20 && BOILERPLATE_RE.test(text)) return true;
2563
- if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
2564
- return false;
2565
- }
2566
- function toShingles(text, n = 4) {
2567
- const words = text.split(/\s+/).filter((w) => w.length > 1);
2568
- const shingles = /* @__PURE__ */ new Set();
2569
- for (let i = 0; i <= words.length - n; i++) {
2570
- shingles.add(words.slice(i, i + n).join(" "));
2571
- }
2572
- return shingles;
2573
- }
2574
- function shingleSimilarity(a, b) {
2575
- if (a.size === 0 && b.size === 0) return 0;
2576
- let intersection = 0;
2577
- for (const s of a) {
2578
- if (b.has(s)) intersection++;
2579
- }
2580
- const union = a.size + b.size - intersection;
2581
- return union === 0 ? 0 : intersection / union;
2582
- }
2583
- function extractPageParagraphs(html) {
2584
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
2585
- const pMatches = cleaned.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
2586
- return pMatches.map((p) => {
2587
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
2588
- return { text, shingles: toShingles(text) };
2589
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
2590
- }
2591
- function splitIntoSectionsWithParagraphs(html) {
2592
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
2593
- const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
2594
- const sections = [];
2595
- for (const part of parts) {
2596
- const hMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
2597
- const heading = hMatch ? hMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
2598
- const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
2599
- const paragraphs = pMatches.map((p) => {
2600
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
2601
- return { text, shingles: toShingles(text) };
2602
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplateParagraph(p.text));
2603
- if (paragraphs.length > 0) sections.push({ heading, paragraphs });
2604
- }
2605
- return sections;
2606
- }
2607
2628
  function findIntraPageDuplicates(html) {
2608
- const sections = splitIntoSectionsWithParagraphs(html);
2629
+ const sections = extractDuplicateContentSections(html);
2609
2630
  if (sections.length < 2) return [];
2610
2631
  const pairs = [];
2611
2632
  for (let i = 0; i < sections.length; i++) {
@@ -2614,7 +2635,7 @@ function findIntraPageDuplicates(html) {
2614
2635
  for (const pA of sections[i].paragraphs) {
2615
2636
  if (found) break;
2616
2637
  for (const pB of sections[j].paragraphs) {
2617
- const sim = shingleSimilarity(pA.shingles, pB.shingles);
2638
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
2618
2639
  if (sim > 0.4) {
2619
2640
  pairs.push({
2620
2641
  headingA: sections[i].heading,
@@ -2694,11 +2715,11 @@ function checkCrossPageDuplication(data) {
2694
2715
  const findings = [];
2695
2716
  const pages = [];
2696
2717
  if (data.homepage) {
2697
- pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractPageParagraphs(data.homepage.text) });
2718
+ pages.push({ url: data.homepage.finalUrl || `https://${data.domain}/`, paragraphs: extractDuplicateContentParagraphs(data.homepage.text) });
2698
2719
  }
2699
2720
  if (data.blogSample) {
2700
2721
  for (const page of data.blogSample) {
2701
- pages.push({ url: page.finalUrl || "", paragraphs: extractPageParagraphs(page.text) });
2722
+ pages.push({ url: page.finalUrl || "", paragraphs: extractDuplicateContentParagraphs(page.text) });
2702
2723
  }
2703
2724
  }
2704
2725
  if (pages.length <= 1) {
@@ -2730,7 +2751,7 @@ function checkCrossPageDuplication(data) {
2730
2751
  const fpA = [...pA.shingles].slice(0, 5).join("|");
2731
2752
  if (siteBoilerprints.has(fpA)) continue;
2732
2753
  for (const pB of pages[j].paragraphs) {
2733
- const sim = shingleSimilarity(pA.shingles, pB.shingles);
2754
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
2734
2755
  if (sim > 0.4) {
2735
2756
  dupCount++;
2736
2757
  if (!sample) sample = pA.text.slice(0, 80);
@@ -4433,18 +4454,11 @@ function scoreImageContextAI(html) {
4433
4454
  if (contextualImages.length > 0) score += 3;
4434
4455
  return cap(score, 10);
4435
4456
  }
4436
- var BOILERPLATE_PATTERNS = /\b(sign up|subscribe|get started|contact us|request a demo|free trial|book a call|schedule a|learn more|click here|follow us|share this|copyright|all rights reserved|privacy policy|terms of service)\b/i;
4437
- function isBoilerplate(text) {
4438
- const words = text.split(/\s+/).length;
4439
- if (words < 20 && BOILERPLATE_PATTERNS.test(text)) return true;
4440
- if (/\b(cookie|gdpr|consent|opt.out)\b/i.test(text) && words < 30) return true;
4441
- return false;
4442
- }
4443
4457
  function scoreDuplicateContent(html) {
4444
4458
  return scoreDuplicateContentDetailed(html).score;
4445
4459
  }
4446
4460
  function scoreDuplicateContentDetailed(html) {
4447
- const sections = extractSectionsWithParagraphs(html);
4461
+ const sections = extractDuplicateContentSections(html);
4448
4462
  if (sections.length < 2) return { score: 10, duplicates: [] };
4449
4463
  const totalParagraphs = sections.reduce((sum, s) => sum + s.paragraphs.length, 0);
4450
4464
  const duplicates = [];
@@ -4453,7 +4467,7 @@ function scoreDuplicateContentDetailed(html) {
4453
4467
  for (let j = i + 1; j < sections.length; j++) {
4454
4468
  for (const pA of sections[i].paragraphs) {
4455
4469
  for (const pB of sections[j].paragraphs) {
4456
- const sim = shingleJaccard(pA.shingles, pB.shingles);
4470
+ const sim = shingleJaccardSimilarity(pA.shingles, pB.shingles);
4457
4471
  if (sim > 0.4) {
4458
4472
  dupParagraphCount++;
4459
4473
  duplicates.push({
@@ -4482,41 +4496,6 @@ function scoreDuplicateContentDetailed(html) {
4482
4496
  }
4483
4497
  return { score, duplicates };
4484
4498
  }
4485
- function extractSectionsWithParagraphs(html) {
4486
- const cleaned = html.replace(/<(script|style|nav|header|footer|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "").replace(/<aside\b[^>]*>[\s\S]*?<\/aside>/gi, "");
4487
- const parts = cleaned.split(/(?=<h[23]\b[^>]*>)/i);
4488
- const sections = [];
4489
- for (const part of parts) {
4490
- const headingMatch = part.match(/<h[23]\b[^>]*>([\s\S]*?)<\/h[23]>/i);
4491
- const heading = headingMatch ? headingMatch[1].replace(/<[^>]*>/g, "").trim() : "(intro)";
4492
- const pMatches = part.match(/<p\b[^>]*>([\s\S]*?)<\/p>/gi) || [];
4493
- const paragraphs = pMatches.map((p) => {
4494
- const text = p.replace(/<[^>]*>/g, " ").replace(/&\w+;/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
4495
- return { text, shingles: buildShingles(text, 4) };
4496
- }).filter((p) => p.shingles.size >= 3 && !isBoilerplate(p.text));
4497
- if (paragraphs.length > 0) {
4498
- sections.push({ heading, paragraphs });
4499
- }
4500
- }
4501
- return sections;
4502
- }
4503
- function buildShingles(text, n) {
4504
- const words = text.split(/\s+/).filter((w) => w.length > 1);
4505
- const shingles = /* @__PURE__ */ new Set();
4506
- for (let i = 0; i <= words.length - n; i++) {
4507
- shingles.add(words.slice(i, i + n).join(" "));
4508
- }
4509
- return shingles;
4510
- }
4511
- function shingleJaccard(a, b) {
4512
- if (a.size === 0 && b.size === 0) return 0;
4513
- let intersection = 0;
4514
- for (const s of a) {
4515
- if (b.has(s)) intersection++;
4516
- }
4517
- const union = a.size + b.size - intersection;
4518
- return union === 0 ? 0 : intersection / union;
4519
- }
4520
4499
  var SCORING_FUNCTIONS = {
4521
4500
  schema_markup: scoreSchemaMarkup,
4522
4501
  qa_content_format: scoreQAFormat,