aeorank 3.0.0 → 3.0.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/dist/browser.d.ts +6 -0
- package/dist/browser.js +80 -4
- package/dist/browser.js.map +1 -1
- package/dist/cli.js +149 -4
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +128 -6
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +128 -6
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/browser.d.ts
CHANGED
|
@@ -298,6 +298,12 @@ interface RawDataSummary {
|
|
|
298
298
|
crawl_discovered: number;
|
|
299
299
|
crawl_fetched: number;
|
|
300
300
|
crawl_skipped: number;
|
|
301
|
+
citation_ready_sentences: number;
|
|
302
|
+
answer_first_ratio: number;
|
|
303
|
+
evidence_citations_avg: number;
|
|
304
|
+
entity_disambiguation_ratio: number;
|
|
305
|
+
extraction_friction_avg: number;
|
|
306
|
+
image_figure_ratio: number;
|
|
301
307
|
}
|
|
302
308
|
/**
|
|
303
309
|
* Fetches all site data in parallel with HTTPS/HTTP fallback.
|
package/dist/browser.js
CHANGED
|
@@ -1824,7 +1824,83 @@ function extractRawDataSummary(data) {
|
|
|
1824
1824
|
// Full-crawl stats
|
|
1825
1825
|
crawl_discovered: data.crawlStats?.discovered ?? 0,
|
|
1826
1826
|
crawl_fetched: data.crawlStats?.fetched ?? 0,
|
|
1827
|
-
crawl_skipped: data.crawlStats?.skipped ?? 0
|
|
1827
|
+
crawl_skipped: data.crawlStats?.skipped ?? 0,
|
|
1828
|
+
// V2 criteria fields
|
|
1829
|
+
citation_ready_sentences: (() => {
|
|
1830
|
+
const combinedText = text + " " + (data.blogSample?.map((p) => p.text.replace(/<[^>]*>/g, " ")).join(" ") || "");
|
|
1831
|
+
return (combinedText.match(/\b\w+\s+(is\s+(?:a|an)\s|refers\s+to|defined\s+as)\b/gi) || []).length;
|
|
1832
|
+
})(),
|
|
1833
|
+
answer_first_ratio: (() => {
|
|
1834
|
+
const pages = [html, ...data.blogSample?.map((p) => p.text) || []];
|
|
1835
|
+
let answerFirst = 0;
|
|
1836
|
+
for (const pageHtml of pages) {
|
|
1837
|
+
const bodyMatch = pageHtml.match(/<body[^>]*>([\s\S]*)/i);
|
|
1838
|
+
const bodyHtml = bodyMatch ? bodyMatch[1] : pageHtml;
|
|
1839
|
+
const earlyParas = bodyHtml.match(/<p[^>]*>([\s\S]*?)<\/p>/gi)?.slice(0, 5) || [];
|
|
1840
|
+
for (const p of earlyParas) {
|
|
1841
|
+
const pText = p.replace(/<[^>]*>/g, "").trim();
|
|
1842
|
+
const wc = pText.split(/\s+/).length;
|
|
1843
|
+
if (wc >= 40 && wc <= 80) {
|
|
1844
|
+
answerFirst++;
|
|
1845
|
+
break;
|
|
1846
|
+
}
|
|
1847
|
+
}
|
|
1848
|
+
}
|
|
1849
|
+
return pages.length > 0 ? Math.round(answerFirst / pages.length * 100) : 0;
|
|
1850
|
+
})(),
|
|
1851
|
+
evidence_citations_avg: (() => {
|
|
1852
|
+
const allHtml = html + "\n" + (data.blogSample?.map((p) => p.text).join("\n") || "");
|
|
1853
|
+
const paragraphs = allHtml.match(/<p[^>]*>[\s\S]*?<\/p>/gi) || [];
|
|
1854
|
+
let citations = 0;
|
|
1855
|
+
const domainLower = data.domain.replace(/^www\./, "").toLowerCase();
|
|
1856
|
+
for (const p of paragraphs) {
|
|
1857
|
+
const links = p.match(/<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>/gi) || [];
|
|
1858
|
+
for (const link of links) {
|
|
1859
|
+
const href = link.match(/href=["'](https?:\/\/[^"']+)["']/i);
|
|
1860
|
+
if (href) {
|
|
1861
|
+
try {
|
|
1862
|
+
const ld = new URL(href[1]).hostname.replace(/^www\./, "").toLowerCase();
|
|
1863
|
+
if (ld !== domainLower) citations++;
|
|
1864
|
+
} catch {
|
|
1865
|
+
}
|
|
1866
|
+
}
|
|
1867
|
+
}
|
|
1868
|
+
}
|
|
1869
|
+
const pageCount = Math.max(1, 1 + (data.blogSample?.length ?? 0));
|
|
1870
|
+
return Math.round(citations / pageCount * 10) / 10;
|
|
1871
|
+
})(),
|
|
1872
|
+
entity_disambiguation_ratio: (() => {
|
|
1873
|
+
const pages = [html, ...data.blogSample?.map((p) => p.text) || []];
|
|
1874
|
+
let defined = 0;
|
|
1875
|
+
for (const pageHtml of pages) {
|
|
1876
|
+
const h1Match = pageHtml.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
|
|
1877
|
+
if (!h1Match) continue;
|
|
1878
|
+
const h1Text = h1Match[1].replace(/<[^>]*>/g, "").trim();
|
|
1879
|
+
const h1Words = h1Text.split(/\s+/).filter((w) => w.length > 3);
|
|
1880
|
+
const primaryNoun = h1Words.sort((a, b) => b.length - a.length)[0] || "";
|
|
1881
|
+
if (!primaryNoun) continue;
|
|
1882
|
+
const pageText = pageHtml.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").slice(0, 500);
|
|
1883
|
+
if (new RegExp(`\\b${primaryNoun.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b[^.]*\\b(is|refers|defined|means)\\b`, "i").test(pageText)) {
|
|
1884
|
+
defined++;
|
|
1885
|
+
}
|
|
1886
|
+
}
|
|
1887
|
+
return pages.length > 0 ? Math.round(defined / pages.length * 100) : 0;
|
|
1888
|
+
})(),
|
|
1889
|
+
extraction_friction_avg: (() => {
|
|
1890
|
+
const combinedText = text + " " + (data.blogSample?.map((p) => p.text.replace(/<[^>]*>/g, " ")).join(" ") || "");
|
|
1891
|
+
const sentences = combinedText.split(/[.!?]+/).filter((s) => s.trim().length > 5);
|
|
1892
|
+
if (sentences.length === 0) return 0;
|
|
1893
|
+
const totalWords = sentences.reduce((sum, s) => sum + s.trim().split(/\s+/).length, 0);
|
|
1894
|
+
return Math.round(totalWords / sentences.length * 10) / 10;
|
|
1895
|
+
})(),
|
|
1896
|
+
image_figure_ratio: (() => {
|
|
1897
|
+
const combinedHtml = html + "\n" + (data.blogSample?.map((p) => p.text).join("\n") || "");
|
|
1898
|
+
const allImages = combinedHtml.match(/<img\s[^>]*>/gi) || [];
|
|
1899
|
+
if (allImages.length === 0) return 0;
|
|
1900
|
+
const figureBlocks = combinedHtml.match(/<figure[\s\S]*?<\/figure>/gi) || [];
|
|
1901
|
+
const figuresWithCaption = figureBlocks.filter((f) => /<figcaption/i.test(f));
|
|
1902
|
+
return Math.round(figuresWithCaption.length / allImages.length * 100);
|
|
1903
|
+
})()
|
|
1828
1904
|
};
|
|
1829
1905
|
}
|
|
1830
1906
|
function getPageTopicText(html) {
|
|
@@ -4364,8 +4440,8 @@ function extractLinksWithAnchors(html, sourceUrl, domain) {
|
|
|
4364
4440
|
if (href.startsWith("//")) {
|
|
4365
4441
|
fullUrl = `https:${href}`;
|
|
4366
4442
|
} else if (href.startsWith("/")) {
|
|
4367
|
-
if (href
|
|
4368
|
-
fullUrl = `https://${domain}${href}`;
|
|
4443
|
+
if (href.startsWith("/#")) continue;
|
|
4444
|
+
fullUrl = href === "/" ? `https://${domain}` : `https://${domain}${href}`;
|
|
4369
4445
|
} else if (href.startsWith("http")) {
|
|
4370
4446
|
fullUrl = href;
|
|
4371
4447
|
} else if (href.startsWith("#") || href.startsWith("?") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
@@ -4379,7 +4455,7 @@ function extractLinksWithAnchors(html, sourceUrl, domain) {
|
|
|
4379
4455
|
if (linkDomain !== cleanDomain) continue;
|
|
4380
4456
|
parsed.hash = "";
|
|
4381
4457
|
const path = parsed.pathname;
|
|
4382
|
-
if (path === "
|
|
4458
|
+
if (path === "") continue;
|
|
4383
4459
|
if (RESOURCE_EXTENSIONS.test(path)) continue;
|
|
4384
4460
|
if (SKIP_PATH_PATTERNS.test(path)) continue;
|
|
4385
4461
|
const normalized = normalizeUrl(fullUrl);
|