npm - aeorank - Versions diffs - 3.0.0 → 3.0.2 - Mend

aeorank 3.0.0 → 3.0.2

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/dist/browser.d.ts CHANGED Viewed

@@ -298,6 +298,12 @@ interface RawDataSummary {
     crawl_discovered: number;
     crawl_fetched: number;
     crawl_skipped: number;
+    citation_ready_sentences: number;
+    answer_first_ratio: number;
+    evidence_citations_avg: number;
+    entity_disambiguation_ratio: number;
+    extraction_friction_avg: number;
+    image_figure_ratio: number;
 }
 /**
  * Fetches all site data in parallel with HTTPS/HTTP fallback.

package/dist/browser.js CHANGED Viewed

@@ -1824,7 +1824,83 @@ function extractRawDataSummary(data) {
     // Full-crawl stats
     crawl_discovered: data.crawlStats?.discovered ?? 0,
     crawl_fetched: data.crawlStats?.fetched ?? 0,
-    crawl_skipped: data.crawlStats?.skipped ?? 0
+    crawl_skipped: data.crawlStats?.skipped ?? 0,
+    // V2 criteria fields
+    citation_ready_sentences: (() => {
+      const combinedText = text + " " + (data.blogSample?.map((p) => p.text.replace(/<[^>]*>/g, " ")).join(" ") || "");
+      return (combinedText.match(/\b\w+\s+(is\s+(?:a|an)\s|refers\s+to|defined\s+as)\b/gi) || []).length;
+    })(),
+    answer_first_ratio: (() => {
+      const pages = [html, ...data.blogSample?.map((p) => p.text) || []];
+      let answerFirst = 0;
+      for (const pageHtml of pages) {
+        const bodyMatch = pageHtml.match(/<body[^>]*>([\s\S]*)/i);
+        const bodyHtml = bodyMatch ? bodyMatch[1] : pageHtml;
+        const earlyParas = bodyHtml.match(/<p[^>]*>([\s\S]*?)<\/p>/gi)?.slice(0, 5) || [];
+        for (const p of earlyParas) {
+          const pText = p.replace(/<[^>]*>/g, "").trim();
+          const wc = pText.split(/\s+/).length;
+          if (wc >= 40 && wc <= 80) {
+            answerFirst++;
+            break;
+          }
+        }
+      }
+      return pages.length > 0 ? Math.round(answerFirst / pages.length * 100) : 0;
+    })(),
+    evidence_citations_avg: (() => {
+      const allHtml = html + "\n" + (data.blogSample?.map((p) => p.text).join("\n") || "");
+      const paragraphs = allHtml.match(/<p[^>]*>[\s\S]*?<\/p>/gi) || [];
+      let citations = 0;
+      const domainLower = data.domain.replace(/^www\./, "").toLowerCase();
+      for (const p of paragraphs) {
+        const links = p.match(/<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>/gi) || [];
+        for (const link of links) {
+          const href = link.match(/href=["'](https?:\/\/[^"']+)["']/i);
+          if (href) {
+            try {
+              const ld = new URL(href[1]).hostname.replace(/^www\./, "").toLowerCase();
+              if (ld !== domainLower) citations++;
+            } catch {
+            }
+          }
+        }
+      }
+      const pageCount = Math.max(1, 1 + (data.blogSample?.length ?? 0));
+      return Math.round(citations / pageCount * 10) / 10;
+    })(),
+    entity_disambiguation_ratio: (() => {
+      const pages = [html, ...data.blogSample?.map((p) => p.text) || []];
+      let defined = 0;
+      for (const pageHtml of pages) {
+        const h1Match = pageHtml.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
+        if (!h1Match) continue;
+        const h1Text = h1Match[1].replace(/<[^>]*>/g, "").trim();
+        const h1Words = h1Text.split(/\s+/).filter((w) => w.length > 3);
+        const primaryNoun = h1Words.sort((a, b) => b.length - a.length)[0] || "";
+        if (!primaryNoun) continue;
+        const pageText = pageHtml.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").slice(0, 500);
+        if (new RegExp(`\\b${primaryNoun.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b[^.]*\\b(is|refers|defined|means)\\b`, "i").test(pageText)) {
+          defined++;
+        }
+      }
+      return pages.length > 0 ? Math.round(defined / pages.length * 100) : 0;
+    })(),
+    extraction_friction_avg: (() => {
+      const combinedText = text + " " + (data.blogSample?.map((p) => p.text.replace(/<[^>]*>/g, " ")).join(" ") || "");
+      const sentences = combinedText.split(/[.!?]+/).filter((s) => s.trim().length > 5);
+      if (sentences.length === 0) return 0;
+      const totalWords = sentences.reduce((sum, s) => sum + s.trim().split(/\s+/).length, 0);
+      return Math.round(totalWords / sentences.length * 10) / 10;
+    })(),
+    image_figure_ratio: (() => {
+      const combinedHtml = html + "\n" + (data.blogSample?.map((p) => p.text).join("\n") || "");
+      const allImages = combinedHtml.match(/<img\s[^>]*>/gi) || [];
+      if (allImages.length === 0) return 0;
+      const figureBlocks = combinedHtml.match(/<figure[\s\S]*?<\/figure>/gi) || [];
+      const figuresWithCaption = figureBlocks.filter((f) => /<figcaption/i.test(f));
+      return Math.round(figuresWithCaption.length / allImages.length * 100);
+    })()
   };
 }
 function getPageTopicText(html) {
@@ -4364,8 +4440,8 @@ function extractLinksWithAnchors(html, sourceUrl, domain) {
     if (href.startsWith("//")) {
       fullUrl = `https:${href}`;
     } else if (href.startsWith("/")) {
-      if (href === "/" || href.startsWith("/#")) continue;
-      fullUrl = `https://${domain}${href}`;
+      if (href.startsWith("/#")) continue;
+      fullUrl = href === "/" ? `https://${domain}` : `https://${domain}${href}`;
     } else if (href.startsWith("http")) {
       fullUrl = href;
     } else if (href.startsWith("#") || href.startsWith("?") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
@@ -4379,7 +4455,7 @@ function extractLinksWithAnchors(html, sourceUrl, domain) {
       if (linkDomain !== cleanDomain) continue;
       parsed.hash = "";
       const path = parsed.pathname;
-      if (path === "/" || path === "") continue;
+      if (path === "") continue;
       if (RESOURCE_EXTENSIONS.test(path)) continue;
       if (SKIP_PATH_PATTERNS.test(path)) continue;
       const normalized = normalizeUrl(fullUrl);