@snap-agent/rag-web 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +5 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +37 -1
- package/dist/index.mjs +37 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -660,6 +660,11 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
660
660
|
private extractBestContentText;
|
|
661
661
|
private bodyTextLengthHint;
|
|
662
662
|
private extractDocumentFromHtml;
|
|
663
|
+
/**
|
|
664
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
665
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
666
|
+
*/
|
|
667
|
+
private extractHeroImage;
|
|
663
668
|
private looksLikeDynamicShell;
|
|
664
669
|
private diagFromRenderedAttempt;
|
|
665
670
|
private crawlPageSmart;
|
package/dist/index.d.ts
CHANGED
|
@@ -660,6 +660,11 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
660
660
|
private extractBestContentText;
|
|
661
661
|
private bodyTextLengthHint;
|
|
662
662
|
private extractDocumentFromHtml;
|
|
663
|
+
/**
|
|
664
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
665
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
666
|
+
*/
|
|
667
|
+
private extractHeroImage;
|
|
663
668
|
private looksLikeDynamicShell;
|
|
664
669
|
private diagFromRenderedAttempt;
|
|
665
670
|
private crawlPageSmart;
|
package/dist/index.js
CHANGED
|
@@ -254,6 +254,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
254
254
|
type: doc.metadata.type,
|
|
255
255
|
title: doc.metadata.title,
|
|
256
256
|
url: doc.metadata.url,
|
|
257
|
+
imageUrl: doc.metadata.imageUrl,
|
|
258
|
+
description: doc.metadata.description,
|
|
257
259
|
score: doc.score
|
|
258
260
|
}))
|
|
259
261
|
}
|
|
@@ -1659,7 +1661,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1659
1661
|
const content = this.extractBestContentText($, config);
|
|
1660
1662
|
const minChars = config.minExtractedContentLength ?? 50;
|
|
1661
1663
|
if (!content || content.length < minChars) return null;
|
|
1662
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") ||
|
|
1664
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1665
|
+
this.extractHeroImage($, url) || void 0;
|
|
1663
1666
|
let imageUrl;
|
|
1664
1667
|
if (image) {
|
|
1665
1668
|
try {
|
|
@@ -1692,6 +1695,39 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1692
1695
|
}
|
|
1693
1696
|
};
|
|
1694
1697
|
}
|
|
1698
|
+
/**
|
|
1699
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1700
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1701
|
+
*/
|
|
1702
|
+
extractHeroImage($, pageUrl) {
|
|
1703
|
+
const containers = $('main, article, [role="main"], #content, .content');
|
|
1704
|
+
const scope = containers.length > 0 ? containers : $("body");
|
|
1705
|
+
let best;
|
|
1706
|
+
scope.find("img[src]").each((_, el) => {
|
|
1707
|
+
if (best) return false;
|
|
1708
|
+
const src = $(el).attr("src") || "";
|
|
1709
|
+
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1710
|
+
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1711
|
+
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1712
|
+
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1713
|
+
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1714
|
+
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1715
|
+
if (src.includes("/_next/image")) {
|
|
1716
|
+
try {
|
|
1717
|
+
const nextUrl = new URL(src, pageUrl);
|
|
1718
|
+
const realUrl = nextUrl.searchParams.get("url");
|
|
1719
|
+
if (realUrl) {
|
|
1720
|
+
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1721
|
+
return false;
|
|
1722
|
+
}
|
|
1723
|
+
} catch {
|
|
1724
|
+
}
|
|
1725
|
+
}
|
|
1726
|
+
best = src;
|
|
1727
|
+
return false;
|
|
1728
|
+
});
|
|
1729
|
+
return best;
|
|
1730
|
+
}
|
|
1695
1731
|
looksLikeDynamicShell(html) {
|
|
1696
1732
|
const lower = html.toLowerCase();
|
|
1697
1733
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
package/dist/index.mjs
CHANGED
|
@@ -218,6 +218,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
218
218
|
type: doc.metadata.type,
|
|
219
219
|
title: doc.metadata.title,
|
|
220
220
|
url: doc.metadata.url,
|
|
221
|
+
imageUrl: doc.metadata.imageUrl,
|
|
222
|
+
description: doc.metadata.description,
|
|
221
223
|
score: doc.score
|
|
222
224
|
}))
|
|
223
225
|
}
|
|
@@ -1623,7 +1625,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1623
1625
|
const content = this.extractBestContentText($, config);
|
|
1624
1626
|
const minChars = config.minExtractedContentLength ?? 50;
|
|
1625
1627
|
if (!content || content.length < minChars) return null;
|
|
1626
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") ||
|
|
1628
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1629
|
+
this.extractHeroImage($, url) || void 0;
|
|
1627
1630
|
let imageUrl;
|
|
1628
1631
|
if (image) {
|
|
1629
1632
|
try {
|
|
@@ -1656,6 +1659,39 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1656
1659
|
}
|
|
1657
1660
|
};
|
|
1658
1661
|
}
|
|
1662
|
+
/**
|
|
1663
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1664
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1665
|
+
*/
|
|
1666
|
+
extractHeroImage($, pageUrl) {
|
|
1667
|
+
const containers = $('main, article, [role="main"], #content, .content');
|
|
1668
|
+
const scope = containers.length > 0 ? containers : $("body");
|
|
1669
|
+
let best;
|
|
1670
|
+
scope.find("img[src]").each((_, el) => {
|
|
1671
|
+
if (best) return false;
|
|
1672
|
+
const src = $(el).attr("src") || "";
|
|
1673
|
+
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1674
|
+
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1675
|
+
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1676
|
+
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1677
|
+
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1678
|
+
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1679
|
+
if (src.includes("/_next/image")) {
|
|
1680
|
+
try {
|
|
1681
|
+
const nextUrl = new URL(src, pageUrl);
|
|
1682
|
+
const realUrl = nextUrl.searchParams.get("url");
|
|
1683
|
+
if (realUrl) {
|
|
1684
|
+
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1685
|
+
return false;
|
|
1686
|
+
}
|
|
1687
|
+
} catch {
|
|
1688
|
+
}
|
|
1689
|
+
}
|
|
1690
|
+
best = src;
|
|
1691
|
+
return false;
|
|
1692
|
+
});
|
|
1693
|
+
return best;
|
|
1694
|
+
}
|
|
1659
1695
|
looksLikeDynamicShell(html) {
|
|
1660
1696
|
const lower = html.toLowerCase();
|
|
1661
1697
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.3",
|
|
3
|
+
"version": "0.1.4",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|