aeorank 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/browser.d.ts CHANGED
@@ -298,6 +298,12 @@ interface RawDataSummary {
298
298
  crawl_discovered: number;
299
299
  crawl_fetched: number;
300
300
  crawl_skipped: number;
301
+ citation_ready_sentences: number;
302
+ answer_first_ratio: number;
303
+ evidence_citations_avg: number;
304
+ entity_disambiguation_ratio: number;
305
+ extraction_friction_avg: number;
306
+ image_figure_ratio: number;
301
307
  }
302
308
  /**
303
309
  * Fetches all site data in parallel with HTTPS/HTTP fallback.
package/dist/browser.js CHANGED
@@ -1824,7 +1824,83 @@ function extractRawDataSummary(data) {
1824
1824
  // Full-crawl stats
1825
1825
  crawl_discovered: data.crawlStats?.discovered ?? 0,
1826
1826
  crawl_fetched: data.crawlStats?.fetched ?? 0,
1827
- crawl_skipped: data.crawlStats?.skipped ?? 0
1827
+ crawl_skipped: data.crawlStats?.skipped ?? 0,
1828
+ // V2 criteria fields
1829
+ citation_ready_sentences: (() => {
1830
+ const combinedText = text + " " + (data.blogSample?.map((p) => p.text.replace(/<[^>]*>/g, " ")).join(" ") || "");
1831
+ return (combinedText.match(/\b\w+\s+(is\s+(?:a|an)\s|refers\s+to|defined\s+as)\b/gi) || []).length;
1832
+ })(),
1833
+ answer_first_ratio: (() => {
1834
+ const pages = [html, ...data.blogSample?.map((p) => p.text) || []];
1835
+ let answerFirst = 0;
1836
+ for (const pageHtml of pages) {
1837
+ const bodyMatch = pageHtml.match(/<body[^>]*>([\s\S]*)/i);
1838
+ const bodyHtml = bodyMatch ? bodyMatch[1] : pageHtml;
1839
+ const earlyParas = bodyHtml.match(/<p[^>]*>([\s\S]*?)<\/p>/gi)?.slice(0, 5) || [];
1840
+ for (const p of earlyParas) {
1841
+ const pText = p.replace(/<[^>]*>/g, "").trim();
1842
+ const wc = pText.split(/\s+/).length;
1843
+ if (wc >= 40 && wc <= 80) {
1844
+ answerFirst++;
1845
+ break;
1846
+ }
1847
+ }
1848
+ }
1849
+ return pages.length > 0 ? Math.round(answerFirst / pages.length * 100) : 0;
1850
+ })(),
1851
+ evidence_citations_avg: (() => {
1852
+ const allHtml = html + "\n" + (data.blogSample?.map((p) => p.text).join("\n") || "");
1853
+ const paragraphs = allHtml.match(/<p[^>]*>[\s\S]*?<\/p>/gi) || [];
1854
+ let citations = 0;
1855
+ const domainLower = data.domain.replace(/^www\./, "").toLowerCase();
1856
+ for (const p of paragraphs) {
1857
+ const links = p.match(/<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>/gi) || [];
1858
+ for (const link of links) {
1859
+ const href = link.match(/href=["'](https?:\/\/[^"']+)["']/i);
1860
+ if (href) {
1861
+ try {
1862
+ const ld = new URL(href[1]).hostname.replace(/^www\./, "").toLowerCase();
1863
+ if (ld !== domainLower) citations++;
1864
+ } catch {
1865
+ }
1866
+ }
1867
+ }
1868
+ }
1869
+ const pageCount = Math.max(1, 1 + (data.blogSample?.length ?? 0));
1870
+ return Math.round(citations / pageCount * 10) / 10;
1871
+ })(),
1872
+ entity_disambiguation_ratio: (() => {
1873
+ const pages = [html, ...data.blogSample?.map((p) => p.text) || []];
1874
+ let defined = 0;
1875
+ for (const pageHtml of pages) {
1876
+ const h1Match = pageHtml.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
1877
+ if (!h1Match) continue;
1878
+ const h1Text = h1Match[1].replace(/<[^>]*>/g, "").trim();
1879
+ const h1Words = h1Text.split(/\s+/).filter((w) => w.length > 3);
1880
+ const primaryNoun = h1Words.sort((a, b) => b.length - a.length)[0] || "";
1881
+ if (!primaryNoun) continue;
1882
+ const pageText = pageHtml.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").slice(0, 500);
1883
+ if (new RegExp(`\\b${primaryNoun.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b[^.]*\\b(is|refers|defined|means)\\b`, "i").test(pageText)) {
1884
+ defined++;
1885
+ }
1886
+ }
1887
+ return pages.length > 0 ? Math.round(defined / pages.length * 100) : 0;
1888
+ })(),
1889
+ extraction_friction_avg: (() => {
1890
+ const combinedText = text + " " + (data.blogSample?.map((p) => p.text.replace(/<[^>]*>/g, " ")).join(" ") || "");
1891
+ const sentences = combinedText.split(/[.!?]+/).filter((s) => s.trim().length > 5);
1892
+ if (sentences.length === 0) return 0;
1893
+ const totalWords = sentences.reduce((sum, s) => sum + s.trim().split(/\s+/).length, 0);
1894
+ return Math.round(totalWords / sentences.length * 10) / 10;
1895
+ })(),
1896
+ image_figure_ratio: (() => {
1897
+ const combinedHtml = html + "\n" + (data.blogSample?.map((p) => p.text).join("\n") || "");
1898
+ const allImages = combinedHtml.match(/<img\s[^>]*>/gi) || [];
1899
+ if (allImages.length === 0) return 0;
1900
+ const figureBlocks = combinedHtml.match(/<figure[\s\S]*?<\/figure>/gi) || [];
1901
+ const figuresWithCaption = figureBlocks.filter((f) => /<figcaption/i.test(f));
1902
+ return Math.round(figuresWithCaption.length / allImages.length * 100);
1903
+ })()
1828
1904
  };
1829
1905
  }
1830
1906
  function getPageTopicText(html) {