@snap-agent/rag-web 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
690
690
  * Crawl a single page and extract content
691
691
  */
692
692
  private crawlPage;
693
- /**
694
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
695
- * would otherwise hit an empty wrapper.
696
- */
697
- private static readonly DEFAULT_CONTENT_SELECTOR;
698
- private stripNoiseFromDom;
699
- /** Longest cleaned text among selector matches and full body (after noise strip). */
700
- private extractBestContentText;
701
693
  private bodyTextLengthHint;
702
694
  private extractDocumentFromHtml;
703
- /**
704
- * Fallback image extraction: finds the first meaningful image in the content area.
705
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
706
- */
707
- private extractHeroImage;
708
695
  private looksLikeDynamicShell;
709
696
  private diagFromRenderedAttempt;
710
697
  private crawlPageSmart;
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
717
704
  /**
718
705
  * Clean extracted text content
719
706
  */
720
- private cleanContent;
721
- /**
722
- * Convert URL to a stable document ID
723
- */
724
707
  private urlToId;
725
708
  /**
726
709
  * Delay helper
@@ -783,4 +766,43 @@ declare class WebRAGPlugin implements RAGPlugin {
783
766
  getConfig(): Record<string, any>;
784
767
  }
785
768
 
786
- export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
769
+ interface HtmlPageExtractOptions {
770
+ titleSelector?: string;
771
+ contentSelector?: string;
772
+ removeSelectors?: string[];
773
+ defaultType?: string;
774
+ typeFromUrl?: Record<string, string>;
775
+ minExtractedContentLength?: number;
776
+ metadata?: Record<string, unknown>;
777
+ }
778
+ interface HtmlPageExtractResult {
779
+ id: string;
780
+ metadata: Record<string, unknown>;
781
+ content: string;
782
+ /** True when content meets minExtractedContentLength (default 50). */
783
+ indexable: boolean;
784
+ contentPreview: string;
785
+ }
786
+ declare function urlToDocumentId(url: string): string;
787
+ declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
788
+ /**
789
+ * Extract full page metadata + main content the same way web-rag does on HTML ingest.
790
+ * Unlike ingest, always returns metadata even when content is too short to index.
791
+ */
792
+ declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
793
+
794
+ interface ProductMetadata {
795
+ price?: number;
796
+ currency?: string;
797
+ availability?: string;
798
+ }
799
+ /**
800
+ * Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
801
+ * Per-field priority: JSON-LD → Open Graph → microdata.
802
+ */
803
+ declare function extractProductMetadata(html: string): ProductMetadata;
804
+ declare function parsePrice(value: unknown): number | undefined;
805
+ declare function normalizeCurrency(value: unknown): string | undefined;
806
+ declare function normalizeAvailability(value: unknown): string | undefined;
807
+
808
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
package/dist/index.d.ts CHANGED
@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
690
690
  * Crawl a single page and extract content
691
691
  */
692
692
  private crawlPage;
693
- /**
694
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
695
- * would otherwise hit an empty wrapper.
696
- */
697
- private static readonly DEFAULT_CONTENT_SELECTOR;
698
- private stripNoiseFromDom;
699
- /** Longest cleaned text among selector matches and full body (after noise strip). */
700
- private extractBestContentText;
701
693
  private bodyTextLengthHint;
702
694
  private extractDocumentFromHtml;
703
- /**
704
- * Fallback image extraction: finds the first meaningful image in the content area.
705
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
706
- */
707
- private extractHeroImage;
708
695
  private looksLikeDynamicShell;
709
696
  private diagFromRenderedAttempt;
710
697
  private crawlPageSmart;
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
717
704
  /**
718
705
  * Clean extracted text content
719
706
  */
720
- private cleanContent;
721
- /**
722
- * Convert URL to a stable document ID
723
- */
724
707
  private urlToId;
725
708
  /**
726
709
  * Delay helper
@@ -783,4 +766,43 @@ declare class WebRAGPlugin implements RAGPlugin {
783
766
  getConfig(): Record<string, any>;
784
767
  }
785
768
 
786
- export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
769
+ interface HtmlPageExtractOptions {
770
+ titleSelector?: string;
771
+ contentSelector?: string;
772
+ removeSelectors?: string[];
773
+ defaultType?: string;
774
+ typeFromUrl?: Record<string, string>;
775
+ minExtractedContentLength?: number;
776
+ metadata?: Record<string, unknown>;
777
+ }
778
+ interface HtmlPageExtractResult {
779
+ id: string;
780
+ metadata: Record<string, unknown>;
781
+ content: string;
782
+ /** True when content meets minExtractedContentLength (default 50). */
783
+ indexable: boolean;
784
+ contentPreview: string;
785
+ }
786
+ declare function urlToDocumentId(url: string): string;
787
+ declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
788
+ /**
789
+ * Extract full page metadata + main content the same way web-rag does on HTML ingest.
790
+ * Unlike ingest, always returns metadata even when content is too short to index.
791
+ */
792
+ declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
793
+
794
+ interface ProductMetadata {
795
+ price?: number;
796
+ currency?: string;
797
+ availability?: string;
798
+ }
799
+ /**
800
+ * Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
801
+ * Per-field priority: JSON-LD → Open Graph → microdata.
802
+ */
803
+ declare function extractProductMetadata(html: string): ProductMetadata;
804
+ declare function parsePrice(value: unknown): number | undefined;
805
+ declare function normalizeCurrency(value: unknown): string | undefined;
806
+ declare function normalizeAvailability(value: unknown): string | undefined;
807
+
808
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
package/dist/index.js CHANGED
@@ -30,16 +30,319 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
- WebRAGPlugin: () => WebRAGPlugin
33
+ WebRAGPlugin: () => WebRAGPlugin,
34
+ bodyTextLengthHint: () => bodyTextLengthHint,
35
+ extractPageFromHtml: () => extractPageFromHtml,
36
+ extractProductMetadata: () => extractProductMetadata,
37
+ normalizeAvailability: () => normalizeAvailability,
38
+ normalizeCurrency: () => normalizeCurrency,
39
+ parsePrice: () => parsePrice,
40
+ urlToDocumentId: () => urlToDocumentId
34
41
  });
35
42
  module.exports = __toCommonJS(index_exports);
36
43
 
37
44
  // src/WebRAGPlugin.ts
38
45
  var import_mongodb = require("mongodb");
39
46
  var import_openai = __toESM(require("openai"));
40
- var cheerio = __toESM(require("cheerio"));
47
+ var cheerio3 = __toESM(require("cheerio"));
41
48
  var fs = __toESM(require("fs"));
42
49
  var path = __toESM(require("path"));
50
+
51
+ // src/htmlPageExtract.ts
52
+ var cheerio2 = __toESM(require("cheerio"));
53
+
54
+ // src/productMetadata.ts
55
+ var cheerio = __toESM(require("cheerio"));
56
+ function extractProductMetadata(html) {
57
+ const $ = cheerio.load(html);
58
+ const fromJsonLd = extractFromJsonLd($);
59
+ const fromOg = extractFromOpenGraph($);
60
+ const fromMicrodata = extractFromMicrodata($);
61
+ const result = {};
62
+ const price = fromJsonLd.price ?? fromOg.price ?? fromMicrodata.price;
63
+ if (price != null) result.price = price;
64
+ const currency = fromJsonLd.currency ?? fromOg.currency ?? fromMicrodata.currency;
65
+ if (currency) result.currency = currency;
66
+ const availability = fromJsonLd.availability ?? fromOg.availability ?? fromMicrodata.availability;
67
+ if (availability) result.availability = availability;
68
+ return result;
69
+ }
70
+ function extractFromJsonLd($) {
71
+ const result = {};
72
+ $('script[type="application/ld+json"]').each((_, el) => {
73
+ if (result.price != null && result.currency && result.availability) return false;
74
+ const raw = $(el).html()?.trim();
75
+ if (!raw) return;
76
+ let parsed;
77
+ try {
78
+ parsed = JSON.parse(raw);
79
+ } catch {
80
+ return;
81
+ }
82
+ for (const node of collectJsonLdNodes(parsed)) {
83
+ if (!isProductType(node)) continue;
84
+ const offer = pickOffer(node);
85
+ if (!offer) continue;
86
+ if (result.price == null) {
87
+ const price = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
88
+ if (price != null) result.price = price;
89
+ }
90
+ if (!result.currency) {
91
+ const currency = normalizeCurrency(offer.priceCurrency);
92
+ if (currency) result.currency = currency;
93
+ }
94
+ if (!result.availability) {
95
+ const availability = normalizeAvailability(offer.availability);
96
+ if (availability) result.availability = availability;
97
+ }
98
+ }
99
+ });
100
+ return result;
101
+ }
102
+ function extractFromOpenGraph($) {
103
+ const result = {};
104
+ const priceRaw = $('meta[property="product:price:amount"]').attr("content") || $('meta[property="og:price:amount"]').attr("content");
105
+ const price = parsePrice(priceRaw);
106
+ if (price != null) result.price = price;
107
+ const currency = normalizeCurrency(
108
+ $('meta[property="product:price:currency"]').attr("content") || $('meta[property="og:price:currency"]').attr("content")
109
+ );
110
+ if (currency) result.currency = currency;
111
+ const availability = normalizeAvailability(
112
+ $('meta[property="product:availability"]').attr("content") || $('meta[property="og:availability"]').attr("content")
113
+ );
114
+ if (availability) result.availability = availability;
115
+ return result;
116
+ }
117
+ function microdataField($, itemprop) {
118
+ const scope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
119
+ return scope.length > 0 ? scope.find(`[itemprop="${itemprop}"]`).first() : $(`[itemprop="${itemprop}"]`).first();
120
+ }
121
+ function extractFromMicrodata($) {
122
+ const result = {};
123
+ const priceEl = microdataField($, "price");
124
+ const price = parsePrice(priceEl.attr("content") || priceEl.text());
125
+ if (price != null) result.price = price;
126
+ const currencyEl = microdataField($, "priceCurrency");
127
+ const currency = normalizeCurrency(currencyEl.attr("content") || currencyEl.text());
128
+ if (currency) result.currency = currency;
129
+ const availabilityEl = microdataField($, "availability");
130
+ const availability = normalizeAvailability(
131
+ availabilityEl.attr("content") || availabilityEl.attr("href") || availabilityEl.text()
132
+ );
133
+ if (availability) result.availability = availability;
134
+ return result;
135
+ }
136
+ function collectJsonLdNodes(data) {
137
+ const nodes = [];
138
+ const visit = (value) => {
139
+ if (value == null) return;
140
+ if (Array.isArray(value)) {
141
+ value.forEach(visit);
142
+ return;
143
+ }
144
+ if (typeof value !== "object") return;
145
+ const obj = value;
146
+ nodes.push(obj);
147
+ if (obj["@graph"]) visit(obj["@graph"]);
148
+ };
149
+ visit(data);
150
+ return nodes;
151
+ }
152
+ function isProductType(node) {
153
+ const type = node["@type"];
154
+ const types = Array.isArray(type) ? type : type != null ? [type] : [];
155
+ return types.some((t) => {
156
+ const s = String(t).toLowerCase();
157
+ return s === "product" || s.endsWith("/product");
158
+ });
159
+ }
160
+ function pickOffer(product) {
161
+ const offers = product.offers;
162
+ if (offers == null) return null;
163
+ if (Array.isArray(offers)) {
164
+ const first = offers.find((o) => o && typeof o === "object");
165
+ return first ?? null;
166
+ }
167
+ if (typeof offers === "object") return offers;
168
+ return null;
169
+ }
170
+ function parsePrice(value) {
171
+ if (value == null || value === "") return void 0;
172
+ if (typeof value === "number" && Number.isFinite(value)) return value;
173
+ let s = String(value).trim();
174
+ if (!s) return void 0;
175
+ s = s.replace(/[^\d.,\-]/g, "");
176
+ if (!s || s === "-" || s === ".") return void 0;
177
+ const lastComma = s.lastIndexOf(",");
178
+ const lastDot = s.lastIndexOf(".");
179
+ if (lastComma > -1 && lastDot > -1) {
180
+ if (lastComma > lastDot) {
181
+ s = s.replace(/\./g, "").replace(",", ".");
182
+ } else {
183
+ s = s.replace(/,/g, "");
184
+ }
185
+ } else if (lastComma > -1) {
186
+ const parts = s.split(",");
187
+ if (parts.length === 2 && parts[1].length <= 2) {
188
+ s = parts[0].replace(/\./g, "") + "." + parts[1];
189
+ } else {
190
+ s = s.replace(/,/g, "");
191
+ }
192
+ }
193
+ const num = parseFloat(s);
194
+ return Number.isFinite(num) ? num : void 0;
195
+ }
196
+ function normalizeCurrency(value) {
197
+ if (value == null) return void 0;
198
+ const s = String(value).trim().toUpperCase();
199
+ if (!s) return void 0;
200
+ const iso = s.match(/[A-Z]{3}/);
201
+ return iso ? iso[0] : s.length <= 4 ? s : void 0;
202
+ }
203
+ function normalizeAvailability(value) {
204
+ if (value == null) return void 0;
205
+ let s = String(value).trim();
206
+ if (!s) return void 0;
207
+ if (s.includes("schema.org/")) {
208
+ const parts = s.split("/");
209
+ s = parts[parts.length - 1] || s;
210
+ }
211
+ s = s.replace(/^https?:\/\/[^/]+\//, "");
212
+ if (s.includes("/")) {
213
+ const parts = s.split("/");
214
+ s = parts[parts.length - 1] || s;
215
+ }
216
+ return s.replace(/\s+/g, "") || void 0;
217
+ }
218
+
219
+ // src/htmlPageExtract.ts
220
+ var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
221
+ var DEFAULT_REMOVE_SELECTORS = [
222
+ "script",
223
+ "style",
224
+ "nav",
225
+ "header",
226
+ "footer",
227
+ ".sidebar",
228
+ ".navigation",
229
+ ".menu",
230
+ ".comments",
231
+ '[role="navigation"]',
232
+ '[role="banner"]'
233
+ ];
234
+ function urlToDocumentId(url) {
235
+ return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
236
+ }
237
+ function cleanContent(text) {
238
+ return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
239
+ }
240
+ function bodyTextLengthHint(html, options = {}) {
241
+ const $ = cheerio2.load(html);
242
+ stripNoiseFromDom($, options);
243
+ return cleanContent($("body").text().trim()).length;
244
+ }
245
+ function extractPageFromHtml(url, html, options = {}) {
246
+ const $ = cheerio2.load(html);
247
+ stripNoiseFromDom($, options);
248
+ const titleSelector = options.titleSelector || "h1, title";
249
+ let title = $(titleSelector).first().text().trim();
250
+ if (!title) {
251
+ title = $("title").text().trim();
252
+ }
253
+ const content = extractBestContentText($, options);
254
+ const minChars = options.minExtractedContentLength ?? 50;
255
+ const indexable = Boolean(content && content.length >= minChars);
256
+ const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || extractHeroImage($, url) || void 0;
257
+ let imageUrl;
258
+ if (image) {
259
+ try {
260
+ imageUrl = new URL(image, url).href;
261
+ } catch {
262
+ imageUrl = image;
263
+ }
264
+ }
265
+ const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
266
+ let type = options.defaultType || "page";
267
+ if (options.typeFromUrl) {
268
+ for (const [pattern, typeName] of Object.entries(options.typeFromUrl)) {
269
+ if (url.includes(pattern)) {
270
+ type = typeName;
271
+ break;
272
+ }
273
+ }
274
+ }
275
+ const productMeta = extractProductMetadata(html);
276
+ const metadata = {
277
+ type,
278
+ ...title ? { title } : {},
279
+ url,
280
+ ...imageUrl ? { imageUrl } : {},
281
+ ...description ? { description } : {},
282
+ ...productMeta.price != null ? { price: productMeta.price } : {},
283
+ ...productMeta.currency ? { currency: productMeta.currency } : {},
284
+ ...productMeta.availability ? { availability: productMeta.availability } : {},
285
+ ...options.metadata
286
+ };
287
+ const previewLen = 400;
288
+ const contentPreview = content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;
289
+ return {
290
+ id: urlToDocumentId(url),
291
+ metadata,
292
+ content,
293
+ indexable,
294
+ contentPreview
295
+ };
296
+ }
297
+ function stripNoiseFromDom($, options) {
298
+ const removeSelectors = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
299
+ removeSelectors.forEach((selector) => $(selector).remove());
300
+ }
301
+ function extractBestContentText($, options) {
302
+ const contentSelector = options.contentSelector || DEFAULT_CONTENT_SELECTOR;
303
+ const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
304
+ let best = "";
305
+ for (const sel of selectors) {
306
+ $(sel).each((_, el) => {
307
+ const t = cleanContent($(el).text().trim());
308
+ if (t.length > best.length) best = t;
309
+ });
310
+ }
311
+ const bodyText = cleanContent($("body").text().trim());
312
+ if (bodyText.length > best.length) best = bodyText;
313
+ return best;
314
+ }
315
+ function extractHeroImage($, pageUrl) {
316
+ const containers = $('main, article, [role="main"], #content, .content');
317
+ const scope = containers.length > 0 ? containers : $("body");
318
+ let best;
319
+ scope.find("img[src]").each((_, el) => {
320
+ if (best) return false;
321
+ const src = $(el).attr("src") || "";
322
+ const alt = ($(el).attr("alt") || "").toLowerCase();
323
+ const width = parseInt($(el).attr("width") || "0", 10);
324
+ const height = parseInt($(el).attr("height") || "0", 10);
325
+ if (width > 0 && width < 80 || height > 0 && height < 80) return;
326
+ if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
327
+ if (src.startsWith("data:") || src.endsWith(".svg")) return;
328
+ if (src.includes("/_next/image")) {
329
+ try {
330
+ const nextUrl = new URL(src, pageUrl);
331
+ const realUrl = nextUrl.searchParams.get("url");
332
+ if (realUrl) {
333
+ best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
334
+ return false;
335
+ }
336
+ } catch {
337
+ }
338
+ }
339
+ best = src;
340
+ return false;
341
+ });
342
+ return best;
343
+ }
344
+
345
+ // src/WebRAGPlugin.ts
43
346
  function bulkOpCurrentUrl(op) {
44
347
  const meta = op.document?.metadata;
45
348
  if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
@@ -58,7 +361,7 @@ function isUrlListingInsert(document) {
58
361
  return false;
59
362
  }
60
363
  }
61
- var WebRAGPlugin = class _WebRAGPlugin {
364
+ var WebRAGPlugin = class {
62
365
  name = "web-rag";
63
366
  type = "rag";
64
367
  priority;
@@ -288,6 +591,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
288
591
  url: doc.metadata.url,
289
592
  imageUrl: doc.metadata.imageUrl,
290
593
  description: doc.metadata.description,
594
+ ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
595
+ ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
596
+ ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
291
597
  score: doc.score
292
598
  }))
293
599
  }
@@ -1458,7 +1764,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1458
1764
  return await response.text();
1459
1765
  }
1460
1766
  extractInternalLinks(html, base, stripQueryParams) {
1461
- const $ = cheerio.load(html);
1767
+ const $ = cheerio3.load(html);
1462
1768
  const links = /* @__PURE__ */ new Set();
1463
1769
  $("a[href]").each((_, el) => {
1464
1770
  const href = ($(el).attr("href") || "").trim();
@@ -1624,7 +1930,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1624
1930
  }
1625
1931
  }
1626
1932
  try {
1627
- const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
1933
+ const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
1628
1934
  renderMode,
1629
1935
  renderOptions,
1630
1936
  minContentLength,
@@ -1655,7 +1961,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1655
1961
  status: crawlSt,
1656
1962
  modeUsed: diag?.modeUsed,
1657
1963
  contentLength: doc?.content?.length,
1658
- bodyTextLengthHint,
1964
+ bodyTextLengthHint: bodyTextLengthHint2,
1659
1965
  title: doc?.metadata?.title,
1660
1966
  docId: doc?.id,
1661
1967
  error: diag?.errorMessage
@@ -1767,125 +2073,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
1767
2073
  const html = await response.text();
1768
2074
  return this.extractDocumentFromHtml(url, html, config);
1769
2075
  }
1770
- /**
1771
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
1772
- * would otherwise hit an empty wrapper.
1773
- */
1774
- static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
1775
- stripNoiseFromDom($, config) {
1776
- const removeSelectors = config.removeSelectors || [
1777
- "script",
1778
- "style",
1779
- "nav",
1780
- "header",
1781
- "footer",
1782
- ".sidebar",
1783
- ".navigation",
1784
- ".menu",
1785
- ".comments",
1786
- '[role="navigation"]',
1787
- '[role="banner"]'
1788
- ];
1789
- removeSelectors.forEach((selector) => $(selector).remove());
1790
- }
1791
- /** Longest cleaned text among selector matches and full body (after noise strip). */
1792
- extractBestContentText($, config) {
1793
- const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
1794
- const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
1795
- let best = "";
1796
- for (const sel of selectors) {
1797
- $(sel).each((_, el) => {
1798
- const t = this.cleanContent($(el).text().trim());
1799
- if (t.length > best.length) best = t;
1800
- });
1801
- }
1802
- const bodyText = this.cleanContent($("body").text().trim());
1803
- if (bodyText.length > best.length) best = bodyText;
1804
- return best;
1805
- }
1806
2076
  bodyTextLengthHint(html, config) {
1807
- const $ = cheerio.load(html);
1808
- this.stripNoiseFromDom($, config);
1809
- return this.cleanContent($("body").text().trim()).length;
2077
+ return bodyTextLengthHint(html, config);
1810
2078
  }
1811
2079
  extractDocumentFromHtml(url, html, config) {
1812
- const $ = cheerio.load(html);
1813
- this.stripNoiseFromDom($, config);
1814
- const titleSelector = config.titleSelector || "h1, title";
1815
- let title = $(titleSelector).first().text().trim();
1816
- if (!title) {
1817
- title = $("title").text().trim();
1818
- }
1819
- const content = this.extractBestContentText($, config);
1820
- const minChars = config.minExtractedContentLength ?? 50;
1821
- if (!content || content.length < minChars) return null;
1822
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1823
- this.extractHeroImage($, url) || void 0;
1824
- let imageUrl;
1825
- if (image) {
1826
- try {
1827
- imageUrl = new URL(image, url).href;
1828
- } catch {
1829
- imageUrl = image;
1830
- }
1831
- }
1832
- const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
1833
- let type = config.defaultType || "page";
1834
- if (config.typeFromUrl) {
1835
- for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
1836
- if (url.includes(pattern)) {
1837
- type = typeName;
1838
- break;
1839
- }
1840
- }
1841
- }
1842
- const id = this.urlToId(url);
2080
+ const extracted = extractPageFromHtml(url, html, config);
2081
+ if (!extracted.indexable) return null;
1843
2082
  return {
1844
- id,
1845
- content,
1846
- metadata: {
1847
- type,
1848
- title,
1849
- url,
1850
- ...imageUrl ? { imageUrl } : {},
1851
- ...description ? { description } : {},
1852
- ...config.metadata
1853
- }
2083
+ id: extracted.id,
2084
+ content: extracted.content,
2085
+ metadata: extracted.metadata
1854
2086
  };
1855
2087
  }
1856
- /**
1857
- * Fallback image extraction: finds the first meaningful image in the content area.
1858
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
1859
- */
1860
- extractHeroImage($, pageUrl) {
1861
- const containers = $('main, article, [role="main"], #content, .content');
1862
- const scope = containers.length > 0 ? containers : $("body");
1863
- let best;
1864
- scope.find("img[src]").each((_, el) => {
1865
- if (best) return false;
1866
- const src = $(el).attr("src") || "";
1867
- const alt = ($(el).attr("alt") || "").toLowerCase();
1868
- const width = parseInt($(el).attr("width") || "0", 10);
1869
- const height = parseInt($(el).attr("height") || "0", 10);
1870
- if (width > 0 && width < 80 || height > 0 && height < 80) return;
1871
- if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1872
- if (src.startsWith("data:") || src.endsWith(".svg")) return;
1873
- if (src.includes("/_next/image")) {
1874
- try {
1875
- const nextUrl = new URL(src, pageUrl);
1876
- const realUrl = nextUrl.searchParams.get("url");
1877
- if (realUrl) {
1878
- best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1879
- return false;
1880
- }
1881
- } catch {
1882
- }
1883
- }
1884
- best = src;
1885
- return false;
1886
- });
1887
- return best;
1888
- }
1889
2088
  looksLikeDynamicShell(html) {
1890
2089
  const lower = html.toLowerCase();
1891
2090
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
@@ -1903,7 +2102,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1903
2102
  const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
1904
2103
  return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
1905
2104
  }
1906
- diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
2105
+ diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
1907
2106
  if (blockedSuspected) {
1908
2107
  return {
1909
2108
  doc: null,
@@ -1919,12 +2118,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1919
2118
  return {
1920
2119
  doc,
1921
2120
  diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
1922
- bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
2121
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
1923
2122
  };
1924
2123
  }
1925
2124
  async crawlPageSmart(url, config, timeout, ctx) {
1926
2125
  if (ctx.renderMode === true) {
1927
- const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
2126
+ const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
1928
2127
  url,
1929
2128
  config,
1930
2129
  timeout,
@@ -1933,7 +2132,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1933
2132
  );
1934
2133
  return this.diagFromRenderedAttempt(
1935
2134
  doc,
1936
- bodyTextLengthHint,
2135
+ bodyTextLengthHint2,
1937
2136
  renderFailure,
1938
2137
  blockedSuspected,
1939
2138
  "render_ok",
@@ -2050,7 +2249,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2050
2249
  }
2051
2250
  }
2052
2251
  const html = await page.content();
2053
- const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
2252
+ const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
2054
2253
  const doc = this.extractDocumentFromHtml(url, html, config);
2055
2254
  if (config.debug?.saveDir && config.debug?.enabled) {
2056
2255
  try {
@@ -2065,7 +2264,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2065
2264
  dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
2066
2265
  }
2067
2266
  }
2068
- return { doc, bodyTextLengthHint };
2267
+ return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
2069
2268
  } catch (e) {
2070
2269
  const msg = String(e?.message || e || "render_failed");
2071
2270
  const lower = msg.toLowerCase();
@@ -2157,14 +2356,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
2157
2356
  /**
2158
2357
  * Clean extracted text content
2159
2358
  */
2160
- cleanContent(text) {
2161
- return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
2162
- }
2163
- /**
2164
- * Convert URL to a stable document ID
2165
- */
2166
2359
  urlToId(url) {
2167
- return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
2360
+ return urlToDocumentId(url);
2168
2361
  }
2169
2362
  /**
2170
2363
  * Delay helper
@@ -2434,5 +2627,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
2434
2627
  };
2435
2628
  // Annotate the CommonJS export names for ESM import in node:
2436
2629
  0 && (module.exports = {
2437
- WebRAGPlugin
2630
+ WebRAGPlugin,
2631
+ bodyTextLengthHint,
2632
+ extractPageFromHtml,
2633
+ extractProductMetadata,
2634
+ normalizeAvailability,
2635
+ normalizeCurrency,
2636
+ parsePrice,
2637
+ urlToDocumentId
2438
2638
  });
package/dist/index.mjs CHANGED
@@ -1,9 +1,305 @@
1
1
  // src/WebRAGPlugin.ts
2
2
  import { MongoClient } from "mongodb";
3
3
  import OpenAI from "openai";
4
- import * as cheerio from "cheerio";
4
+ import * as cheerio3 from "cheerio";
5
5
  import * as fs from "fs";
6
6
  import * as path from "path";
7
+
8
+ // src/htmlPageExtract.ts
9
+ import * as cheerio2 from "cheerio";
10
+
11
+ // src/productMetadata.ts
12
+ import * as cheerio from "cheerio";
13
+ function extractProductMetadata(html) {
14
+ const $ = cheerio.load(html);
15
+ const fromJsonLd = extractFromJsonLd($);
16
+ const fromOg = extractFromOpenGraph($);
17
+ const fromMicrodata = extractFromMicrodata($);
18
+ const result = {};
19
+ const price = fromJsonLd.price ?? fromOg.price ?? fromMicrodata.price;
20
+ if (price != null) result.price = price;
21
+ const currency = fromJsonLd.currency ?? fromOg.currency ?? fromMicrodata.currency;
22
+ if (currency) result.currency = currency;
23
+ const availability = fromJsonLd.availability ?? fromOg.availability ?? fromMicrodata.availability;
24
+ if (availability) result.availability = availability;
25
+ return result;
26
+ }
27
+ function extractFromJsonLd($) {
28
+ const result = {};
29
+ $('script[type="application/ld+json"]').each((_, el) => {
30
+ if (result.price != null && result.currency && result.availability) return false;
31
+ const raw = $(el).html()?.trim();
32
+ if (!raw) return;
33
+ let parsed;
34
+ try {
35
+ parsed = JSON.parse(raw);
36
+ } catch {
37
+ return;
38
+ }
39
+ for (const node of collectJsonLdNodes(parsed)) {
40
+ if (!isProductType(node)) continue;
41
+ const offer = pickOffer(node);
42
+ if (!offer) continue;
43
+ if (result.price == null) {
44
+ const price = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
45
+ if (price != null) result.price = price;
46
+ }
47
+ if (!result.currency) {
48
+ const currency = normalizeCurrency(offer.priceCurrency);
49
+ if (currency) result.currency = currency;
50
+ }
51
+ if (!result.availability) {
52
+ const availability = normalizeAvailability(offer.availability);
53
+ if (availability) result.availability = availability;
54
+ }
55
+ }
56
+ });
57
+ return result;
58
+ }
59
+ function extractFromOpenGraph($) {
60
+ const result = {};
61
+ const priceRaw = $('meta[property="product:price:amount"]').attr("content") || $('meta[property="og:price:amount"]').attr("content");
62
+ const price = parsePrice(priceRaw);
63
+ if (price != null) result.price = price;
64
+ const currency = normalizeCurrency(
65
+ $('meta[property="product:price:currency"]').attr("content") || $('meta[property="og:price:currency"]').attr("content")
66
+ );
67
+ if (currency) result.currency = currency;
68
+ const availability = normalizeAvailability(
69
+ $('meta[property="product:availability"]').attr("content") || $('meta[property="og:availability"]').attr("content")
70
+ );
71
+ if (availability) result.availability = availability;
72
+ return result;
73
+ }
74
+ function microdataField($, itemprop) {
75
+ const scope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
76
+ return scope.length > 0 ? scope.find(`[itemprop="${itemprop}"]`).first() : $(`[itemprop="${itemprop}"]`).first();
77
+ }
78
+ function extractFromMicrodata($) {
79
+ const result = {};
80
+ const priceEl = microdataField($, "price");
81
+ const price = parsePrice(priceEl.attr("content") || priceEl.text());
82
+ if (price != null) result.price = price;
83
+ const currencyEl = microdataField($, "priceCurrency");
84
+ const currency = normalizeCurrency(currencyEl.attr("content") || currencyEl.text());
85
+ if (currency) result.currency = currency;
86
+ const availabilityEl = microdataField($, "availability");
87
+ const availability = normalizeAvailability(
88
+ availabilityEl.attr("content") || availabilityEl.attr("href") || availabilityEl.text()
89
+ );
90
+ if (availability) result.availability = availability;
91
+ return result;
92
+ }
93
+ function collectJsonLdNodes(data) {
94
+ const nodes = [];
95
+ const visit = (value) => {
96
+ if (value == null) return;
97
+ if (Array.isArray(value)) {
98
+ value.forEach(visit);
99
+ return;
100
+ }
101
+ if (typeof value !== "object") return;
102
+ const obj = value;
103
+ nodes.push(obj);
104
+ if (obj["@graph"]) visit(obj["@graph"]);
105
+ };
106
+ visit(data);
107
+ return nodes;
108
+ }
109
+ function isProductType(node) {
110
+ const type = node["@type"];
111
+ const types = Array.isArray(type) ? type : type != null ? [type] : [];
112
+ return types.some((t) => {
113
+ const s = String(t).toLowerCase();
114
+ return s === "product" || s.endsWith("/product");
115
+ });
116
+ }
117
+ function pickOffer(product) {
118
+ const offers = product.offers;
119
+ if (offers == null) return null;
120
+ if (Array.isArray(offers)) {
121
+ const first = offers.find((o) => o && typeof o === "object");
122
+ return first ?? null;
123
+ }
124
+ if (typeof offers === "object") return offers;
125
+ return null;
126
+ }
127
+ function parsePrice(value) {
128
+ if (value == null || value === "") return void 0;
129
+ if (typeof value === "number" && Number.isFinite(value)) return value;
130
+ let s = String(value).trim();
131
+ if (!s) return void 0;
132
+ s = s.replace(/[^\d.,\-]/g, "");
133
+ if (!s || s === "-" || s === ".") return void 0;
134
+ const lastComma = s.lastIndexOf(",");
135
+ const lastDot = s.lastIndexOf(".");
136
+ if (lastComma > -1 && lastDot > -1) {
137
+ if (lastComma > lastDot) {
138
+ s = s.replace(/\./g, "").replace(",", ".");
139
+ } else {
140
+ s = s.replace(/,/g, "");
141
+ }
142
+ } else if (lastComma > -1) {
143
+ const parts = s.split(",");
144
+ if (parts.length === 2 && parts[1].length <= 2) {
145
+ s = parts[0].replace(/\./g, "") + "." + parts[1];
146
+ } else {
147
+ s = s.replace(/,/g, "");
148
+ }
149
+ }
150
+ const num = parseFloat(s);
151
+ return Number.isFinite(num) ? num : void 0;
152
+ }
153
+ function normalizeCurrency(value) {
154
+ if (value == null) return void 0;
155
+ const s = String(value).trim().toUpperCase();
156
+ if (!s) return void 0;
157
+ const iso = s.match(/[A-Z]{3}/);
158
+ return iso ? iso[0] : s.length <= 4 ? s : void 0;
159
+ }
160
+ function normalizeAvailability(value) {
161
+ if (value == null) return void 0;
162
+ let s = String(value).trim();
163
+ if (!s) return void 0;
164
+ if (s.includes("schema.org/")) {
165
+ const parts = s.split("/");
166
+ s = parts[parts.length - 1] || s;
167
+ }
168
+ s = s.replace(/^https?:\/\/[^/]+\//, "");
169
+ if (s.includes("/")) {
170
+ const parts = s.split("/");
171
+ s = parts[parts.length - 1] || s;
172
+ }
173
+ return s.replace(/\s+/g, "") || void 0;
174
+ }
175
+
176
+ // src/htmlPageExtract.ts
177
+ var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
178
+ var DEFAULT_REMOVE_SELECTORS = [
179
+ "script",
180
+ "style",
181
+ "nav",
182
+ "header",
183
+ "footer",
184
+ ".sidebar",
185
+ ".navigation",
186
+ ".menu",
187
+ ".comments",
188
+ '[role="navigation"]',
189
+ '[role="banner"]'
190
+ ];
191
+ function urlToDocumentId(url) {
192
+ return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
193
+ }
194
+ function cleanContent(text) {
195
+ return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
196
+ }
197
+ function bodyTextLengthHint(html, options = {}) {
198
+ const $ = cheerio2.load(html);
199
+ stripNoiseFromDom($, options);
200
+ return cleanContent($("body").text().trim()).length;
201
+ }
202
+ function extractPageFromHtml(url, html, options = {}) {
203
+ const $ = cheerio2.load(html);
204
+ stripNoiseFromDom($, options);
205
+ const titleSelector = options.titleSelector || "h1, title";
206
+ let title = $(titleSelector).first().text().trim();
207
+ if (!title) {
208
+ title = $("title").text().trim();
209
+ }
210
+ const content = extractBestContentText($, options);
211
+ const minChars = options.minExtractedContentLength ?? 50;
212
+ const indexable = Boolean(content && content.length >= minChars);
213
+ const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || extractHeroImage($, url) || void 0;
214
+ let imageUrl;
215
+ if (image) {
216
+ try {
217
+ imageUrl = new URL(image, url).href;
218
+ } catch {
219
+ imageUrl = image;
220
+ }
221
+ }
222
+ const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
223
+ let type = options.defaultType || "page";
224
+ if (options.typeFromUrl) {
225
+ for (const [pattern, typeName] of Object.entries(options.typeFromUrl)) {
226
+ if (url.includes(pattern)) {
227
+ type = typeName;
228
+ break;
229
+ }
230
+ }
231
+ }
232
+ const productMeta = extractProductMetadata(html);
233
+ const metadata = {
234
+ type,
235
+ ...title ? { title } : {},
236
+ url,
237
+ ...imageUrl ? { imageUrl } : {},
238
+ ...description ? { description } : {},
239
+ ...productMeta.price != null ? { price: productMeta.price } : {},
240
+ ...productMeta.currency ? { currency: productMeta.currency } : {},
241
+ ...productMeta.availability ? { availability: productMeta.availability } : {},
242
+ ...options.metadata
243
+ };
244
+ const previewLen = 400;
245
+ const contentPreview = content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;
246
+ return {
247
+ id: urlToDocumentId(url),
248
+ metadata,
249
+ content,
250
+ indexable,
251
+ contentPreview
252
+ };
253
+ }
254
+ function stripNoiseFromDom($, options) {
255
+ const removeSelectors = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
256
+ removeSelectors.forEach((selector) => $(selector).remove());
257
+ }
258
+ function extractBestContentText($, options) {
259
+ const contentSelector = options.contentSelector || DEFAULT_CONTENT_SELECTOR;
260
+ const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
261
+ let best = "";
262
+ for (const sel of selectors) {
263
+ $(sel).each((_, el) => {
264
+ const t = cleanContent($(el).text().trim());
265
+ if (t.length > best.length) best = t;
266
+ });
267
+ }
268
+ const bodyText = cleanContent($("body").text().trim());
269
+ if (bodyText.length > best.length) best = bodyText;
270
+ return best;
271
+ }
272
+ function extractHeroImage($, pageUrl) {
273
+ const containers = $('main, article, [role="main"], #content, .content');
274
+ const scope = containers.length > 0 ? containers : $("body");
275
+ let best;
276
+ scope.find("img[src]").each((_, el) => {
277
+ if (best) return false;
278
+ const src = $(el).attr("src") || "";
279
+ const alt = ($(el).attr("alt") || "").toLowerCase();
280
+ const width = parseInt($(el).attr("width") || "0", 10);
281
+ const height = parseInt($(el).attr("height") || "0", 10);
282
+ if (width > 0 && width < 80 || height > 0 && height < 80) return;
283
+ if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
284
+ if (src.startsWith("data:") || src.endsWith(".svg")) return;
285
+ if (src.includes("/_next/image")) {
286
+ try {
287
+ const nextUrl = new URL(src, pageUrl);
288
+ const realUrl = nextUrl.searchParams.get("url");
289
+ if (realUrl) {
290
+ best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
291
+ return false;
292
+ }
293
+ } catch {
294
+ }
295
+ }
296
+ best = src;
297
+ return false;
298
+ });
299
+ return best;
300
+ }
301
+
302
+ // src/WebRAGPlugin.ts
7
303
  function bulkOpCurrentUrl(op) {
8
304
  const meta = op.document?.metadata;
9
305
  if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
@@ -22,7 +318,7 @@ function isUrlListingInsert(document) {
22
318
  return false;
23
319
  }
24
320
  }
25
- var WebRAGPlugin = class _WebRAGPlugin {
321
+ var WebRAGPlugin = class {
26
322
  name = "web-rag";
27
323
  type = "rag";
28
324
  priority;
@@ -252,6 +548,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
252
548
  url: doc.metadata.url,
253
549
  imageUrl: doc.metadata.imageUrl,
254
550
  description: doc.metadata.description,
551
+ ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
552
+ ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
553
+ ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
255
554
  score: doc.score
256
555
  }))
257
556
  }
@@ -1422,7 +1721,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1422
1721
  return await response.text();
1423
1722
  }
1424
1723
  extractInternalLinks(html, base, stripQueryParams) {
1425
- const $ = cheerio.load(html);
1724
+ const $ = cheerio3.load(html);
1426
1725
  const links = /* @__PURE__ */ new Set();
1427
1726
  $("a[href]").each((_, el) => {
1428
1727
  const href = ($(el).attr("href") || "").trim();
@@ -1588,7 +1887,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1588
1887
  }
1589
1888
  }
1590
1889
  try {
1591
- const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
1890
+ const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
1592
1891
  renderMode,
1593
1892
  renderOptions,
1594
1893
  minContentLength,
@@ -1619,7 +1918,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1619
1918
  status: crawlSt,
1620
1919
  modeUsed: diag?.modeUsed,
1621
1920
  contentLength: doc?.content?.length,
1622
- bodyTextLengthHint,
1921
+ bodyTextLengthHint: bodyTextLengthHint2,
1623
1922
  title: doc?.metadata?.title,
1624
1923
  docId: doc?.id,
1625
1924
  error: diag?.errorMessage
@@ -1731,125 +2030,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
1731
2030
  const html = await response.text();
1732
2031
  return this.extractDocumentFromHtml(url, html, config);
1733
2032
  }
1734
- /**
1735
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
1736
- * would otherwise hit an empty wrapper.
1737
- */
1738
- static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
1739
- stripNoiseFromDom($, config) {
1740
- const removeSelectors = config.removeSelectors || [
1741
- "script",
1742
- "style",
1743
- "nav",
1744
- "header",
1745
- "footer",
1746
- ".sidebar",
1747
- ".navigation",
1748
- ".menu",
1749
- ".comments",
1750
- '[role="navigation"]',
1751
- '[role="banner"]'
1752
- ];
1753
- removeSelectors.forEach((selector) => $(selector).remove());
1754
- }
1755
- /** Longest cleaned text among selector matches and full body (after noise strip). */
1756
- extractBestContentText($, config) {
1757
- const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
1758
- const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
1759
- let best = "";
1760
- for (const sel of selectors) {
1761
- $(sel).each((_, el) => {
1762
- const t = this.cleanContent($(el).text().trim());
1763
- if (t.length > best.length) best = t;
1764
- });
1765
- }
1766
- const bodyText = this.cleanContent($("body").text().trim());
1767
- if (bodyText.length > best.length) best = bodyText;
1768
- return best;
1769
- }
1770
2033
  bodyTextLengthHint(html, config) {
1771
- const $ = cheerio.load(html);
1772
- this.stripNoiseFromDom($, config);
1773
- return this.cleanContent($("body").text().trim()).length;
2034
+ return bodyTextLengthHint(html, config);
1774
2035
  }
1775
2036
  extractDocumentFromHtml(url, html, config) {
1776
- const $ = cheerio.load(html);
1777
- this.stripNoiseFromDom($, config);
1778
- const titleSelector = config.titleSelector || "h1, title";
1779
- let title = $(titleSelector).first().text().trim();
1780
- if (!title) {
1781
- title = $("title").text().trim();
1782
- }
1783
- const content = this.extractBestContentText($, config);
1784
- const minChars = config.minExtractedContentLength ?? 50;
1785
- if (!content || content.length < minChars) return null;
1786
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1787
- this.extractHeroImage($, url) || void 0;
1788
- let imageUrl;
1789
- if (image) {
1790
- try {
1791
- imageUrl = new URL(image, url).href;
1792
- } catch {
1793
- imageUrl = image;
1794
- }
1795
- }
1796
- const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
1797
- let type = config.defaultType || "page";
1798
- if (config.typeFromUrl) {
1799
- for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
1800
- if (url.includes(pattern)) {
1801
- type = typeName;
1802
- break;
1803
- }
1804
- }
1805
- }
1806
- const id = this.urlToId(url);
2037
+ const extracted = extractPageFromHtml(url, html, config);
2038
+ if (!extracted.indexable) return null;
1807
2039
  return {
1808
- id,
1809
- content,
1810
- metadata: {
1811
- type,
1812
- title,
1813
- url,
1814
- ...imageUrl ? { imageUrl } : {},
1815
- ...description ? { description } : {},
1816
- ...config.metadata
1817
- }
2040
+ id: extracted.id,
2041
+ content: extracted.content,
2042
+ metadata: extracted.metadata
1818
2043
  };
1819
2044
  }
1820
- /**
1821
- * Fallback image extraction: finds the first meaningful image in the content area.
1822
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
1823
- */
1824
- extractHeroImage($, pageUrl) {
1825
- const containers = $('main, article, [role="main"], #content, .content');
1826
- const scope = containers.length > 0 ? containers : $("body");
1827
- let best;
1828
- scope.find("img[src]").each((_, el) => {
1829
- if (best) return false;
1830
- const src = $(el).attr("src") || "";
1831
- const alt = ($(el).attr("alt") || "").toLowerCase();
1832
- const width = parseInt($(el).attr("width") || "0", 10);
1833
- const height = parseInt($(el).attr("height") || "0", 10);
1834
- if (width > 0 && width < 80 || height > 0 && height < 80) return;
1835
- if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1836
- if (src.startsWith("data:") || src.endsWith(".svg")) return;
1837
- if (src.includes("/_next/image")) {
1838
- try {
1839
- const nextUrl = new URL(src, pageUrl);
1840
- const realUrl = nextUrl.searchParams.get("url");
1841
- if (realUrl) {
1842
- best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1843
- return false;
1844
- }
1845
- } catch {
1846
- }
1847
- }
1848
- best = src;
1849
- return false;
1850
- });
1851
- return best;
1852
- }
1853
2045
  looksLikeDynamicShell(html) {
1854
2046
  const lower = html.toLowerCase();
1855
2047
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
@@ -1867,7 +2059,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1867
2059
  const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
1868
2060
  return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
1869
2061
  }
1870
- diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
2062
+ diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
1871
2063
  if (blockedSuspected) {
1872
2064
  return {
1873
2065
  doc: null,
@@ -1883,12 +2075,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1883
2075
  return {
1884
2076
  doc,
1885
2077
  diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
1886
- bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
2078
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
1887
2079
  };
1888
2080
  }
1889
2081
  async crawlPageSmart(url, config, timeout, ctx) {
1890
2082
  if (ctx.renderMode === true) {
1891
- const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
2083
+ const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
1892
2084
  url,
1893
2085
  config,
1894
2086
  timeout,
@@ -1897,7 +2089,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1897
2089
  );
1898
2090
  return this.diagFromRenderedAttempt(
1899
2091
  doc,
1900
- bodyTextLengthHint,
2092
+ bodyTextLengthHint2,
1901
2093
  renderFailure,
1902
2094
  blockedSuspected,
1903
2095
  "render_ok",
@@ -2014,7 +2206,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2014
2206
  }
2015
2207
  }
2016
2208
  const html = await page.content();
2017
- const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
2209
+ const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
2018
2210
  const doc = this.extractDocumentFromHtml(url, html, config);
2019
2211
  if (config.debug?.saveDir && config.debug?.enabled) {
2020
2212
  try {
@@ -2029,7 +2221,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2029
2221
  dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
2030
2222
  }
2031
2223
  }
2032
- return { doc, bodyTextLengthHint };
2224
+ return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
2033
2225
  } catch (e) {
2034
2226
  const msg = String(e?.message || e || "render_failed");
2035
2227
  const lower = msg.toLowerCase();
@@ -2121,14 +2313,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
2121
2313
  /**
2122
2314
  * Clean extracted text content
2123
2315
  */
2124
- cleanContent(text) {
2125
- return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
2126
- }
2127
- /**
2128
- * Convert URL to a stable document ID
2129
- */
2130
2316
  urlToId(url) {
2131
- return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
2317
+ return urlToDocumentId(url);
2132
2318
  }
2133
2319
  /**
2134
2320
  * Delay helper
@@ -2397,5 +2583,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
2397
2583
  }
2398
2584
  };
2399
2585
  export {
2400
- WebRAGPlugin
2586
+ WebRAGPlugin,
2587
+ bodyTextLengthHint,
2588
+ extractPageFromHtml,
2589
+ extractProductMetadata,
2590
+ normalizeAvailability,
2591
+ normalizeCurrency,
2592
+ parsePrice,
2593
+ urlToDocumentId
2401
2594
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@snap-agent/rag-web",
3
- "version": "0.1.5",
3
+ "version": "0.1.6",
4
4
  "description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",