@snap-agent/rag-web 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -660,6 +660,11 @@ declare class WebRAGPlugin implements RAGPlugin {
660
660
  private extractBestContentText;
661
661
  private bodyTextLengthHint;
662
662
  private extractDocumentFromHtml;
663
+ /**
664
+ * Fallback image extraction: finds the first meaningful image in the content area.
665
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
666
+ */
667
+ private extractHeroImage;
663
668
  private looksLikeDynamicShell;
664
669
  private diagFromRenderedAttempt;
665
670
  private crawlPageSmart;
package/dist/index.d.ts CHANGED
@@ -660,6 +660,11 @@ declare class WebRAGPlugin implements RAGPlugin {
660
660
  private extractBestContentText;
661
661
  private bodyTextLengthHint;
662
662
  private extractDocumentFromHtml;
663
+ /**
664
+ * Fallback image extraction: finds the first meaningful image in the content area.
665
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
666
+ */
667
+ private extractHeroImage;
663
668
  private looksLikeDynamicShell;
664
669
  private diagFromRenderedAttempt;
665
670
  private crawlPageSmart;
package/dist/index.js CHANGED
@@ -254,6 +254,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
254
254
  type: doc.metadata.type,
255
255
  title: doc.metadata.title,
256
256
  url: doc.metadata.url,
257
+ imageUrl: doc.metadata.imageUrl,
258
+ description: doc.metadata.description,
257
259
  score: doc.score
258
260
  }))
259
261
  }
@@ -1659,7 +1661,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
1659
1661
  const content = this.extractBestContentText($, config);
1660
1662
  const minChars = config.minExtractedContentLength ?? 50;
1661
1663
  if (!content || content.length < minChars) return null;
1662
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || void 0;
1664
+ const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1665
+ this.extractHeroImage($, url) || void 0;
1663
1666
  let imageUrl;
1664
1667
  if (image) {
1665
1668
  try {
@@ -1668,6 +1671,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1668
1671
  imageUrl = image;
1669
1672
  }
1670
1673
  }
1674
+ const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
1671
1675
  let type = config.defaultType || "page";
1672
1676
  if (config.typeFromUrl) {
1673
1677
  for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
@@ -1686,10 +1690,44 @@ var WebRAGPlugin = class _WebRAGPlugin {
1686
1690
  title,
1687
1691
  url,
1688
1692
  ...imageUrl ? { imageUrl } : {},
1693
+ ...description ? { description } : {},
1689
1694
  ...config.metadata
1690
1695
  }
1691
1696
  };
1692
1697
  }
1698
+ /**
1699
+ * Fallback image extraction: finds the first meaningful image in the content area.
1700
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
1701
+ */
1702
+ extractHeroImage($, pageUrl) {
1703
+ const containers = $('main, article, [role="main"], #content, .content');
1704
+ const scope = containers.length > 0 ? containers : $("body");
1705
+ let best;
1706
+ scope.find("img[src]").each((_, el) => {
1707
+ if (best) return false;
1708
+ const src = $(el).attr("src") || "";
1709
+ const alt = ($(el).attr("alt") || "").toLowerCase();
1710
+ const width = parseInt($(el).attr("width") || "0", 10);
1711
+ const height = parseInt($(el).attr("height") || "0", 10);
1712
+ if (width > 0 && width < 80 || height > 0 && height < 80) return;
1713
+ if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1714
+ if (src.startsWith("data:") || src.endsWith(".svg")) return;
1715
+ if (src.includes("/_next/image")) {
1716
+ try {
1717
+ const nextUrl = new URL(src, pageUrl);
1718
+ const realUrl = nextUrl.searchParams.get("url");
1719
+ if (realUrl) {
1720
+ best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1721
+ return false;
1722
+ }
1723
+ } catch {
1724
+ }
1725
+ }
1726
+ best = src;
1727
+ return false;
1728
+ });
1729
+ return best;
1730
+ }
1693
1731
  looksLikeDynamicShell(html) {
1694
1732
  const lower = html.toLowerCase();
1695
1733
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
package/dist/index.mjs CHANGED
@@ -218,6 +218,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
218
218
  type: doc.metadata.type,
219
219
  title: doc.metadata.title,
220
220
  url: doc.metadata.url,
221
+ imageUrl: doc.metadata.imageUrl,
222
+ description: doc.metadata.description,
221
223
  score: doc.score
222
224
  }))
223
225
  }
@@ -1623,7 +1625,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
1623
1625
  const content = this.extractBestContentText($, config);
1624
1626
  const minChars = config.minExtractedContentLength ?? 50;
1625
1627
  if (!content || content.length < minChars) return null;
1626
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || void 0;
1628
+ const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1629
+ this.extractHeroImage($, url) || void 0;
1627
1630
  let imageUrl;
1628
1631
  if (image) {
1629
1632
  try {
@@ -1632,6 +1635,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1632
1635
  imageUrl = image;
1633
1636
  }
1634
1637
  }
1638
+ const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
1635
1639
  let type = config.defaultType || "page";
1636
1640
  if (config.typeFromUrl) {
1637
1641
  for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
@@ -1650,10 +1654,44 @@ var WebRAGPlugin = class _WebRAGPlugin {
1650
1654
  title,
1651
1655
  url,
1652
1656
  ...imageUrl ? { imageUrl } : {},
1657
+ ...description ? { description } : {},
1653
1658
  ...config.metadata
1654
1659
  }
1655
1660
  };
1656
1661
  }
1662
+ /**
1663
+ * Fallback image extraction: finds the first meaningful image in the content area.
1664
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
1665
+ */
1666
+ extractHeroImage($, pageUrl) {
1667
+ const containers = $('main, article, [role="main"], #content, .content');
1668
+ const scope = containers.length > 0 ? containers : $("body");
1669
+ let best;
1670
+ scope.find("img[src]").each((_, el) => {
1671
+ if (best) return false;
1672
+ const src = $(el).attr("src") || "";
1673
+ const alt = ($(el).attr("alt") || "").toLowerCase();
1674
+ const width = parseInt($(el).attr("width") || "0", 10);
1675
+ const height = parseInt($(el).attr("height") || "0", 10);
1676
+ if (width > 0 && width < 80 || height > 0 && height < 80) return;
1677
+ if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1678
+ if (src.startsWith("data:") || src.endsWith(".svg")) return;
1679
+ if (src.includes("/_next/image")) {
1680
+ try {
1681
+ const nextUrl = new URL(src, pageUrl);
1682
+ const realUrl = nextUrl.searchParams.get("url");
1683
+ if (realUrl) {
1684
+ best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1685
+ return false;
1686
+ }
1687
+ } catch {
1688
+ }
1689
+ }
1690
+ best = src;
1691
+ return false;
1692
+ });
1693
+ return best;
1694
+ }
1657
1695
  looksLikeDynamicShell(html) {
1658
1696
  const lower = html.toLowerCase();
1659
1697
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@snap-agent/rag-web",
3
- "version": "0.1.2",
3
+ "version": "0.1.4",
4
4
  "description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",