@snap-agent/rag-web 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +5 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +39 -1
- package/dist/index.mjs +39 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -660,6 +660,11 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
660
660
|
private extractBestContentText;
|
|
661
661
|
private bodyTextLengthHint;
|
|
662
662
|
private extractDocumentFromHtml;
|
|
663
|
+
/**
|
|
664
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
665
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
666
|
+
*/
|
|
667
|
+
private extractHeroImage;
|
|
663
668
|
private looksLikeDynamicShell;
|
|
664
669
|
private diagFromRenderedAttempt;
|
|
665
670
|
private crawlPageSmart;
|
package/dist/index.d.ts
CHANGED
|
@@ -660,6 +660,11 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
660
660
|
private extractBestContentText;
|
|
661
661
|
private bodyTextLengthHint;
|
|
662
662
|
private extractDocumentFromHtml;
|
|
663
|
+
/**
|
|
664
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
665
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
666
|
+
*/
|
|
667
|
+
private extractHeroImage;
|
|
663
668
|
private looksLikeDynamicShell;
|
|
664
669
|
private diagFromRenderedAttempt;
|
|
665
670
|
private crawlPageSmart;
|
package/dist/index.js
CHANGED
|
@@ -254,6 +254,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
254
254
|
type: doc.metadata.type,
|
|
255
255
|
title: doc.metadata.title,
|
|
256
256
|
url: doc.metadata.url,
|
|
257
|
+
imageUrl: doc.metadata.imageUrl,
|
|
258
|
+
description: doc.metadata.description,
|
|
257
259
|
score: doc.score
|
|
258
260
|
}))
|
|
259
261
|
}
|
|
@@ -1659,7 +1661,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1659
1661
|
const content = this.extractBestContentText($, config);
|
|
1660
1662
|
const minChars = config.minExtractedContentLength ?? 50;
|
|
1661
1663
|
if (!content || content.length < minChars) return null;
|
|
1662
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") ||
|
|
1664
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1665
|
+
this.extractHeroImage($, url) || void 0;
|
|
1663
1666
|
let imageUrl;
|
|
1664
1667
|
if (image) {
|
|
1665
1668
|
try {
|
|
@@ -1668,6 +1671,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1668
1671
|
imageUrl = image;
|
|
1669
1672
|
}
|
|
1670
1673
|
}
|
|
1674
|
+
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
1671
1675
|
let type = config.defaultType || "page";
|
|
1672
1676
|
if (config.typeFromUrl) {
|
|
1673
1677
|
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
@@ -1686,10 +1690,44 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1686
1690
|
title,
|
|
1687
1691
|
url,
|
|
1688
1692
|
...imageUrl ? { imageUrl } : {},
|
|
1693
|
+
...description ? { description } : {},
|
|
1689
1694
|
...config.metadata
|
|
1690
1695
|
}
|
|
1691
1696
|
};
|
|
1692
1697
|
}
|
|
1698
|
+
/**
|
|
1699
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1700
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1701
|
+
*/
|
|
1702
|
+
extractHeroImage($, pageUrl) {
|
|
1703
|
+
const containers = $('main, article, [role="main"], #content, .content');
|
|
1704
|
+
const scope = containers.length > 0 ? containers : $("body");
|
|
1705
|
+
let best;
|
|
1706
|
+
scope.find("img[src]").each((_, el) => {
|
|
1707
|
+
if (best) return false;
|
|
1708
|
+
const src = $(el).attr("src") || "";
|
|
1709
|
+
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1710
|
+
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1711
|
+
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1712
|
+
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1713
|
+
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1714
|
+
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1715
|
+
if (src.includes("/_next/image")) {
|
|
1716
|
+
try {
|
|
1717
|
+
const nextUrl = new URL(src, pageUrl);
|
|
1718
|
+
const realUrl = nextUrl.searchParams.get("url");
|
|
1719
|
+
if (realUrl) {
|
|
1720
|
+
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1721
|
+
return false;
|
|
1722
|
+
}
|
|
1723
|
+
} catch {
|
|
1724
|
+
}
|
|
1725
|
+
}
|
|
1726
|
+
best = src;
|
|
1727
|
+
return false;
|
|
1728
|
+
});
|
|
1729
|
+
return best;
|
|
1730
|
+
}
|
|
1693
1731
|
looksLikeDynamicShell(html) {
|
|
1694
1732
|
const lower = html.toLowerCase();
|
|
1695
1733
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
package/dist/index.mjs
CHANGED
|
@@ -218,6 +218,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
218
218
|
type: doc.metadata.type,
|
|
219
219
|
title: doc.metadata.title,
|
|
220
220
|
url: doc.metadata.url,
|
|
221
|
+
imageUrl: doc.metadata.imageUrl,
|
|
222
|
+
description: doc.metadata.description,
|
|
221
223
|
score: doc.score
|
|
222
224
|
}))
|
|
223
225
|
}
|
|
@@ -1623,7 +1625,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1623
1625
|
const content = this.extractBestContentText($, config);
|
|
1624
1626
|
const minChars = config.minExtractedContentLength ?? 50;
|
|
1625
1627
|
if (!content || content.length < minChars) return null;
|
|
1626
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") ||
|
|
1628
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1629
|
+
this.extractHeroImage($, url) || void 0;
|
|
1627
1630
|
let imageUrl;
|
|
1628
1631
|
if (image) {
|
|
1629
1632
|
try {
|
|
@@ -1632,6 +1635,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1632
1635
|
imageUrl = image;
|
|
1633
1636
|
}
|
|
1634
1637
|
}
|
|
1638
|
+
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
1635
1639
|
let type = config.defaultType || "page";
|
|
1636
1640
|
if (config.typeFromUrl) {
|
|
1637
1641
|
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
@@ -1650,10 +1654,44 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1650
1654
|
title,
|
|
1651
1655
|
url,
|
|
1652
1656
|
...imageUrl ? { imageUrl } : {},
|
|
1657
|
+
...description ? { description } : {},
|
|
1653
1658
|
...config.metadata
|
|
1654
1659
|
}
|
|
1655
1660
|
};
|
|
1656
1661
|
}
|
|
1662
|
+
/**
|
|
1663
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1664
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1665
|
+
*/
|
|
1666
|
+
extractHeroImage($, pageUrl) {
|
|
1667
|
+
const containers = $('main, article, [role="main"], #content, .content');
|
|
1668
|
+
const scope = containers.length > 0 ? containers : $("body");
|
|
1669
|
+
let best;
|
|
1670
|
+
scope.find("img[src]").each((_, el) => {
|
|
1671
|
+
if (best) return false;
|
|
1672
|
+
const src = $(el).attr("src") || "";
|
|
1673
|
+
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1674
|
+
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1675
|
+
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1676
|
+
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1677
|
+
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1678
|
+
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1679
|
+
if (src.includes("/_next/image")) {
|
|
1680
|
+
try {
|
|
1681
|
+
const nextUrl = new URL(src, pageUrl);
|
|
1682
|
+
const realUrl = nextUrl.searchParams.get("url");
|
|
1683
|
+
if (realUrl) {
|
|
1684
|
+
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1685
|
+
return false;
|
|
1686
|
+
}
|
|
1687
|
+
} catch {
|
|
1688
|
+
}
|
|
1689
|
+
}
|
|
1690
|
+
best = src;
|
|
1691
|
+
return false;
|
|
1692
|
+
});
|
|
1693
|
+
return best;
|
|
1694
|
+
}
|
|
1657
1695
|
looksLikeDynamicShell(html) {
|
|
1658
1696
|
const lower = html.toLowerCase();
|
|
1659
1697
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.2",
|
|
3
|
+
"version": "0.1.4",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|