@snap-agent/rag-web 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -660,6 +660,11 @@ declare class WebRAGPlugin implements RAGPlugin {
660
660
  private extractBestContentText;
661
661
  private bodyTextLengthHint;
662
662
  private extractDocumentFromHtml;
663
+ /**
664
+ * Fallback image extraction: finds the first meaningful image in the content area.
665
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
666
+ */
667
+ private extractHeroImage;
663
668
  private looksLikeDynamicShell;
664
669
  private diagFromRenderedAttempt;
665
670
  private crawlPageSmart;
package/dist/index.d.ts CHANGED
@@ -660,6 +660,11 @@ declare class WebRAGPlugin implements RAGPlugin {
660
660
  private extractBestContentText;
661
661
  private bodyTextLengthHint;
662
662
  private extractDocumentFromHtml;
663
+ /**
664
+ * Fallback image extraction: finds the first meaningful image in the content area.
665
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
666
+ */
667
+ private extractHeroImage;
663
668
  private looksLikeDynamicShell;
664
669
  private diagFromRenderedAttempt;
665
670
  private crawlPageSmart;
package/dist/index.js CHANGED
@@ -254,6 +254,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
254
254
  type: doc.metadata.type,
255
255
  title: doc.metadata.title,
256
256
  url: doc.metadata.url,
257
+ imageUrl: doc.metadata.imageUrl,
258
+ description: doc.metadata.description,
257
259
  score: doc.score
258
260
  }))
259
261
  }
@@ -1659,7 +1661,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
1659
1661
  const content = this.extractBestContentText($, config);
1660
1662
  const minChars = config.minExtractedContentLength ?? 50;
1661
1663
  if (!content || content.length < minChars) return null;
1662
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || void 0;
1664
+ const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1665
+ this.extractHeroImage($, url) || void 0;
1663
1666
  let imageUrl;
1664
1667
  if (image) {
1665
1668
  try {
@@ -1692,6 +1695,39 @@ var WebRAGPlugin = class _WebRAGPlugin {
1692
1695
  }
1693
1696
  };
1694
1697
  }
1698
+ /**
1699
+ * Fallback image extraction: returns the source of the first `<img src>` found in the main content containers (or in `<body>` when no container matches) that passes the filters below; returns `undefined` when no image qualifies.
1700
+ * Skips images whose declared width/height is under 80, whose src or alt text matches icon-like keywords, `data:` URIs, and sources ending in `.svg`. A `/_next/image` source is unwrapped to its `url` query parameter; any other source is returned exactly as written in the attribute.
1701
+ */
1702
+ extractHeroImage($, pageUrl) {
1703
+ const containers = $('main, article, [role="main"], #content, .content'); // candidate content containers
1704
+ const scope = containers.length > 0 ? containers : $("body"); // search the whole body only when no container matched
1705
+ let best;
1706
+ scope.find("img[src]").each((_, el) => { // NOTE(review): relies on Cheerio's .each() stopping when the callback returns false — confirm against the Cheerio version in use
1707
+ if (best) return false; // a candidate was already chosen
1708
+ const src = $(el).attr("src") || "";
1709
+ const alt = ($(el).attr("alt") || "").toLowerCase();
1710
+ const width = parseInt($(el).attr("width") || "0", 10); // a missing attribute parses as 0, which the size check treats as "unknown"
1711
+ const height = parseInt($(el).attr("height") || "0", 10);
1712
+ if (width > 0 && width < 80 || height > 0 && height < 80) return; // skip only when a declared dimension is positive and below 80
1713
+ if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return; // keyword filter applied to both src and alt text
1714
+ if (src.startsWith("data:") || src.endsWith(".svg")) return; // suffix check is on the raw src, so ".svg" followed by a query string is not skipped
1715
+ if (src.includes("/_next/image")) { // presumably a Next.js image-optimizer URL carrying the original image in its `url` parameter — verify
1716
+ try {
1717
+ const nextUrl = new URL(src, pageUrl);
1718
+ const realUrl = nextUrl.searchParams.get("url");
1719
+ if (realUrl) {
1720
+ best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href; // resolve a non-http original against the page URL
1721
+ return false;
1722
+ }
1723
+ } catch { // URL could not be parsed: fall through and use the raw src below
1724
+ }
1725
+ }
1726
+ best = src; // raw attribute value; not resolved against pageUrl here
1727
+ return false;
1728
+ });
1729
+ return best;
1730
+ }
1695
1731
  looksLikeDynamicShell(html) {
1696
1732
  const lower = html.toLowerCase();
1697
1733
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
package/dist/index.mjs CHANGED
@@ -218,6 +218,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
218
218
  type: doc.metadata.type,
219
219
  title: doc.metadata.title,
220
220
  url: doc.metadata.url,
221
+ imageUrl: doc.metadata.imageUrl,
222
+ description: doc.metadata.description,
221
223
  score: doc.score
222
224
  }))
223
225
  }
@@ -1623,7 +1625,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
1623
1625
  const content = this.extractBestContentText($, config);
1624
1626
  const minChars = config.minExtractedContentLength ?? 50;
1625
1627
  if (!content || content.length < minChars) return null;
1626
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || void 0;
1628
+ const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1629
+ this.extractHeroImage($, url) || void 0;
1627
1630
  let imageUrl;
1628
1631
  if (image) {
1629
1632
  try {
@@ -1656,6 +1659,39 @@ var WebRAGPlugin = class _WebRAGPlugin {
1656
1659
  }
1657
1660
  };
1658
1661
  }
1662
+ /**
1663
+ * Fallback image extraction: returns the source of the first `<img src>` found in the main content containers (or in `<body>` when no container matches) that passes the filters below; returns `undefined` when no image qualifies.
1664
+ * Skips images whose declared width/height is under 80, whose src or alt text matches icon-like keywords, `data:` URIs, and sources ending in `.svg`. A `/_next/image` source is unwrapped to its `url` query parameter; any other source is returned exactly as written in the attribute.
1665
+ */
1666
+ extractHeroImage($, pageUrl) {
1667
+ const containers = $('main, article, [role="main"], #content, .content'); // candidate content containers
1668
+ const scope = containers.length > 0 ? containers : $("body"); // search the whole body only when no container matched
1669
+ let best;
1670
+ scope.find("img[src]").each((_, el) => { // NOTE(review): relies on Cheerio's .each() stopping when the callback returns false — confirm against the Cheerio version in use
1671
+ if (best) return false; // a candidate was already chosen
1672
+ const src = $(el).attr("src") || "";
1673
+ const alt = ($(el).attr("alt") || "").toLowerCase();
1674
+ const width = parseInt($(el).attr("width") || "0", 10); // a missing attribute parses as 0, which the size check treats as "unknown"
1675
+ const height = parseInt($(el).attr("height") || "0", 10);
1676
+ if (width > 0 && width < 80 || height > 0 && height < 80) return; // skip only when a declared dimension is positive and below 80
1677
+ if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return; // keyword filter applied to both src and alt text
1678
+ if (src.startsWith("data:") || src.endsWith(".svg")) return; // suffix check is on the raw src, so ".svg" followed by a query string is not skipped
1679
+ if (src.includes("/_next/image")) { // presumably a Next.js image-optimizer URL carrying the original image in its `url` parameter — verify
1680
+ try {
1681
+ const nextUrl = new URL(src, pageUrl);
1682
+ const realUrl = nextUrl.searchParams.get("url");
1683
+ if (realUrl) {
1684
+ best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href; // resolve a non-http original against the page URL
1685
+ return false;
1686
+ }
1687
+ } catch { // URL could not be parsed: fall through and use the raw src below
1688
+ }
1689
+ }
1690
+ best = src; // raw attribute value; not resolved against pageUrl here
1691
+ return false;
1692
+ });
1693
+ return best;
1694
+ }
1659
1695
  looksLikeDynamicShell(html) {
1660
1696
  const lower = html.toLowerCase();
1661
1697
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@snap-agent/rag-web",
3
- "version": "0.1.3",
3
+ "version": "0.1.4",
4
4
  "description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",