@snap-agent/rag-web 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -30,16 +30,566 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
- WebRAGPlugin: () => WebRAGPlugin
33
+ WebRAGPlugin: () => WebRAGPlugin,
34
+ bodyTextLengthHint: () => bodyTextLengthHint,
35
+ extractPageFromHtml: () => extractPageFromHtml,
36
+ extractProductMetadata: () => extractProductMetadata,
37
+ hardExcludePage: () => hardExcludePage,
38
+ inferTypeFromUrl: () => inferTypeFromUrl,
39
+ normalizeAvailability: () => normalizeAvailability,
40
+ normalizeCurrency: () => normalizeCurrency,
41
+ normalizeDisplayTitle: () => normalizeDisplayTitle,
42
+ parsePrice: () => parsePrice,
43
+ resolvePageCardMetadata: () => resolvePageCardMetadata,
44
+ urlToDocumentId: () => urlToDocumentId
34
45
  });
35
46
  module.exports = __toCommonJS(index_exports);
36
47
 
37
48
  // src/WebRAGPlugin.ts
38
49
  var import_mongodb = require("mongodb");
39
50
  var import_openai = __toESM(require("openai"));
40
- var cheerio = __toESM(require("cheerio"));
51
+ var cheerio4 = __toESM(require("cheerio"));
41
52
  var fs = __toESM(require("fs"));
42
53
  var path = __toESM(require("path"));
54
+
55
+ // src/htmlPageExtract.ts
56
+ var cheerio3 = __toESM(require("cheerio"));
57
+
58
+ // src/productMetadata.ts
59
+ var cheerio = __toESM(require("cheerio"));
60
/**
 * Extracts price, currency and availability from a product page.
 *
 * Three sources are consulted in priority order: JSON-LD, Open Graph meta
 * tags, then microdata. For each field the first non-nullish value wins.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 *   Only the fields that were found are present.
 */
function extractProductMetadata(html) {
  const $ = cheerio.load(html);
  // Order matters: earlier sources take precedence over later ones.
  const sources = [extractFromJsonLd($), extractFromOpenGraph($), extractFromMicrodata($)];
  const firstNonNullish = (field) => {
    for (const source of sources) {
      const candidate = source[field];
      if (candidate != null) return candidate;
    }
    return void 0;
  };
  const merged = {};
  const price = firstNonNullish("price");
  // A price of 0 is valid, so only null/undefined is treated as missing.
  if (price != null) merged.price = price;
  for (const field of ["currency", "availability"]) {
    const text = firstNonNullish(field);
    if (text) merged[field] = text;
  }
  return merged;
}
74
/**
 * Reads product offer data from `application/ld+json` script tags.
 *
 * Scripts are visited in document order. Each field keeps the first usable
 * value found, and scanning stops once all three fields are filled.
 * Scripts that are empty or contain invalid JSON are skipped.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromJsonLd($) {
  const found = {};
  const hasEverything = () =>
    found.price != null && Boolean(found.currency) && Boolean(found.availability);
  // Copies whichever fields are still missing from a single offer object.
  const absorbOffer = (offer) => {
    if (found.price == null) {
      const amount = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
      if (amount != null) found.price = amount;
    }
    if (!found.currency) {
      const code = normalizeCurrency(offer.priceCurrency);
      if (code) found.currency = code;
    }
    if (!found.availability) {
      const stock = normalizeAvailability(offer.availability);
      if (stock) found.availability = stock;
    }
  };
  $('script[type="application/ld+json"]').each((_, el) => {
    // Returning false stops the cheerio iteration early.
    if (hasEverything()) return false;
    const raw = $(el).html()?.trim();
    if (!raw) return void 0;
    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch {
      // Malformed JSON-LD is ignored; other scripts may still be usable.
      return void 0;
    }
    for (const node of collectJsonLdNodes(parsed)) {
      const offer = isProductType(node) ? pickOffer(node) : null;
      if (offer) absorbOffer(offer);
    }
    return void 0;
  });
  return found;
}
106
/**
 * Reads product data from Open Graph style meta tags.
 *
 * For every field the `product:*` property is preferred and the `og:*`
 * property is used when the former is absent or empty.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromOpenGraph($) {
  const metaContent = (suffix) =>
    $(`meta[property="product:${suffix}"]`).attr("content") ||
    $(`meta[property="og:${suffix}"]`).attr("content");
  const og = {};
  const amount = parsePrice(metaContent("price:amount"));
  if (amount != null) og.price = amount;
  const code = normalizeCurrency(metaContent("price:currency"));
  if (code) og.currency = code;
  const stock = normalizeAvailability(metaContent("availability"));
  if (stock) og.availability = stock;
  return og;
}
121
/**
 * Finds the first element carrying the given `itemprop`.
 *
 * The search is limited to the first schema.org Product scope when one
 * exists; otherwise the whole document is searched.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @param {string} itemprop - Microdata property name, e.g. "price".
 * @returns {object} Cheerio selection holding at most one element.
 */
function microdataField($, itemprop) {
  const propSelector = `[itemprop="${itemprop}"]`;
  const productScope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
  const matches = productScope.length > 0 ? productScope.find(propSelector) : $(propSelector);
  return matches.first();
}
125
/**
 * Reads product data from schema.org microdata attributes.
 *
 * Each value is taken from the element's `content` attribute when set and
 * from its text otherwise; availability additionally accepts `href`.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromMicrodata($) {
  const readField = (itemprop, extraAttr) => {
    const el = microdataField($, itemprop);
    return el.attr("content") || (extraAttr ? el.attr(extraAttr) : void 0) || el.text();
  };
  const micro = {};
  const amount = parsePrice(readField("price"));
  if (amount != null) micro.price = amount;
  const code = normalizeCurrency(readField("priceCurrency"));
  if (code) micro.currency = code;
  const stock = normalizeAvailability(readField("availability", "href"));
  if (stock) micro.availability = stock;
  return micro;
}
140
/**
 * Flattens a parsed JSON-LD payload into a list of object nodes.
 *
 * Arrays are expanded in order. Each plain object is emitted and then its
 * `@graph` member (when truthy) is expanded right after it. Primitives and
 * nullish values are dropped.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Nodes in depth-first, document order.
 */
function collectJsonLdNodes(data) {
  const collected = [];
  (function walk(entry) {
    if (entry == null) return;
    if (Array.isArray(entry)) {
      for (const item of entry) walk(item);
      return;
    }
    if (typeof entry !== "object") return;
    collected.push(entry);
    const graph = entry["@graph"];
    if (graph) walk(graph);
  })(data);
  return collected;
}
156
/**
 * Tells whether a JSON-LD node declares itself as a Product.
 *
 * `@type` may be a single value or an array. A value counts when, after
 * lowercasing, it is exactly "product" or ends with "/product".
 *
 * @param {object} node - JSON-LD node.
 * @returns {boolean}
 */
function isProductType(node) {
  const declared = node["@type"];
  if (declared == null) return false;
  const candidates = Array.isArray(declared) ? declared : [declared];
  for (const candidate of candidates) {
    const lowered = String(candidate).toLowerCase();
    if (lowered === "product" || lowered.endsWith("/product")) return true;
  }
  return false;
}
164
/**
 * Selects the offer object to read price data from.
 *
 * @param {object} product - JSON-LD Product node.
 * @returns {object|null} `offers` itself when it is a single object, the
 *   first object entry when it is an array, otherwise null.
 */
function pickOffer(product) {
  const { offers } = product;
  // Covers null/undefined as well as strings, numbers and booleans.
  if (offers == null || typeof offers !== "object") return null;
  if (!Array.isArray(offers)) return offers;
  for (const entry of offers) {
    if (entry && typeof entry === "object") return entry;
  }
  return null;
}
174
/**
 * Parses a price from a number or a loosely formatted string.
 *
 * Everything except digits, ".", "," and "-" is stripped first, so currency
 * symbols and spaces are ignored. Separators are then interpreted as:
 *  - both "," and "." present: whichever comes last is the decimal mark;
 *  - only ",": decimal mark when it appears once followed by at most two
 *    digits ("12,5"), otherwise a thousands separator ("1,234");
 *  - only ".": a full thousands grouping with two or more groups
 *    ("1.234.567") is read as an integer; anything else is left for
 *    parseFloat, so a single dot stays a decimal mark ("1.234" -> 1.234).
 *
 * @param {*} value - Number or string such as "$1,299.00" or "1.299,00 €".
 * @returns {number|undefined} Finite number, or undefined when no price can
 *   be read.
 */
function parsePrice(value) {
  if (value == null || value === "") return void 0;
  if (typeof value === "number" && Number.isFinite(value)) return value;
  let s = String(value).trim();
  if (!s) return void 0;
  s = s.replace(/[^\d.,\-]/g, "");
  if (!s || s === "-" || s === ".") return void 0;
  const lastComma = s.lastIndexOf(",");
  const lastDot = s.lastIndexOf(".");
  if (lastComma > -1 && lastDot > -1) {
    if (lastComma > lastDot) {
      // European style: dots group thousands, comma is the decimal mark.
      s = s.replace(/\./g, "").replace(",", ".");
    } else {
      // US style: commas group thousands.
      s = s.replace(/,/g, "");
    }
  } else if (lastComma > -1) {
    const parts = s.split(",");
    if (parts.length === 2 && parts[1].length <= 2) {
      s = `${parts[0]}.${parts[1]}`;
    } else {
      s = s.replace(/,/g, "");
    }
  } else if (/^-?\d{1,3}(?:\.\d{3}){2,}$/.test(s)) {
    // Several dots in strict 3-digit groups can only be thousands
    // separators; parseFloat alone would stop at the second dot.
    s = s.replace(/\./g, "");
  }
  const num = parseFloat(s);
  return Number.isFinite(num) ? num : void 0;
}
200
/**
 * Normalizes a currency value to an upper-case code or short symbol.
 *
 * A standalone run of exactly three letters (not part of a longer word) is
 * returned as the code, e.g. "Price in usd" -> "USD". When there is none,
 * inputs of at most four characters are still accepted: the first
 * three-letter run if present ("EURO" -> "EUR"), otherwise the input itself
 * ("$", "US$"). Longer text without a standalone code yields undefined
 * rather than three arbitrary letters of a word.
 *
 * @param {*} value - Raw currency text.
 * @returns {string|undefined}
 */
function normalizeCurrency(value) {
  if (value == null) return void 0;
  const s = String(value).trim().toUpperCase();
  if (!s) return void 0;
  // Lookarounds keep the match from being cut out of a longer word.
  const standalone = s.match(/(?<![A-Z])[A-Z]{3}(?![A-Z])/);
  if (standalone) return standalone[0];
  if (s.length > 4) return void 0;
  const prefix = s.match(/[A-Z]{3}/);
  return prefix ? prefix[0] : s;
}
207
/**
 * Reduces an availability value to a compact token.
 *
 * A leading `http(s)://host/` is removed, the last non-empty path segment
 * is kept, and all whitespace inside it is dropped. Examples:
 * "https://schema.org/InStock" -> "InStock", "in stock" -> "instock".
 * Trailing slashes are ignored, so "https://schema.org/InStock/" also
 * yields "InStock".
 *
 * @param {*} value - Raw availability text or URL.
 * @returns {string|undefined} Token, or undefined when nothing is left.
 */
function normalizeAvailability(value) {
  if (value == null) return void 0;
  const text = String(value).trim();
  if (!text) return void 0;
  const withoutOrigin = text.replace(/^https?:\/\/[^/]+\//, "");
  // Empty segments come from doubled or trailing slashes.
  const segments = withoutOrigin.split("/").filter((segment) => segment.trim() !== "");
  const last = segments.length > 0 ? segments[segments.length - 1] : "";
  return last.replace(/\s+/g, "") || void 0;
}
222
+
223
+ // src/pageCardMetadata.ts
224
+ var cheerio2 = __toESM(require("cheerio"));
225
// Path segments that mark utility pages (auth, cart, legal, taxonomy, ...).
// The segment may be followed by "/", end of string, "-" or ".".
const HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;

// Title phrases that mark utility pages.
const HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;

// Whole path segments used by blog / news sections.
const BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;

// Path segments used by promotional sections.
const PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;

// Slugs that end with a promotional suffix such as "summer-sale".
const PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;

// Section prefixes that must be followed by a non-empty slug.
const ENTITY_DETAIL_PATH_RES = [
  /\/projects\/[^/]+/i,
  /\/project\/[^/]+/i,
  /\/perspectives\/[^/]+/i,
  /\/perspective\/[^/]+/i,
  /\/portfolio\/[^/]+/i,
  /\/case-stud(?:y|ies)\/[^/]+/i,
  /\/insights?\/[^/]+/i,
  /\/people\/[^/]+/i,
  /\/person\/[^/]+/i,
  /\/team-members?\/[^/]+/i,
  /\/members?\/[^/]+/i,
  /\/staff\/[^/]+/i,
  /\/experts?\/[^/]+/i,
  /\/authors?\/[^/]+/i,
  /\/leadership\/[^/]+/i,
  /\/biograph(?:y|ies)\/[^/]+/i
];

// Whole path segments associated with single-item pages.
const DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;

// Whole path segments associated with index / catalogue pages.
const LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;

// Whole path segments associated with amenities, activities and experiences.
const AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;

// Whole path segments associated with contact / about / help pages.
const CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;

// Title suffix introduced by a spaced en dash or em dash.
const EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;

// Title suffix introduced by a spaced pipe.
const PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
254
// Numeric card priority per page type; detail pages carry the largest value.
const CARD_PRIORITY = {
  detail: 10,
  listing: 6,
  amenity: 5,
  promotion: 2,
  contact: 1,
  content: 1,
  blog: 0,
  system: 0,
  page: 3
};

// Default card eligibility per page type: only detail, listing and amenity
// pages are eligible.
const CARD_ELIGIBLE_DEFAULT = {
  detail: true,
  listing: true,
  amenity: true,
  promotion: false,
  contact: false,
  content: false,
  blog: false,
  system: false,
  page: false
};
276
// Maps a lower-cased schema.org type name to a page type.
const SCHEMA_TYPE_MAP = {
  product: "detail",
  service: "amenity",
  hotelroom: "detail",
  room: "detail",
  apartment: "detail",
  lodgingroom: "detail",
  course: "detail",
  event: "detail",
  offer: "promotion",
  person: "detail",
  employee: "detail",
  profilepage: "detail",
  article: "detail",
  newsarticle: "detail",
  blogposting: "detail",
  creativework: "detail"
};
294
/**
 * Removes trailing site-name suffixes from a page title.
 *
 * Up to two passes are made. In each pass the text from the first spaced
 * en/em dash onward is cut when at least 4 characters precede it; failing
 * that, the text from the first spaced pipe onward is cut when at least 8
 * characters precede it. The loop stops as soon as neither rule applies.
 *
 * @param {string|undefined} title - Raw title.
 * @returns {string|undefined} Shortened title. A blank or missing input is
 *   returned unchanged; if trimming empties the title, the trimmed input is
 *   returned instead.
 */
function normalizeDisplayTitle(title) {
  if (!title?.trim()) return title;
  const trimmedInput = title.trim();
  // Position where a removable suffix starts, or -1 when none qualifies.
  const findCut = (text) => {
    const dashAt = text.search(EN_DASH_SUFFIX_RE);
    if (dashAt >= 4) return dashAt;
    const pipeAt = text.search(PIPE_SUFFIX_RE);
    if (pipeAt >= 8) return pipeAt;
    return -1;
  };
  let current = trimmedInput;
  for (let pass = 0; pass < 2; pass += 1) {
    const cut = findCut(current);
    if (cut < 0) break;
    current = current.slice(0, cut).trim();
  }
  return current || trimmedInput;
}
312
/**
 * Decides whether a page must never be offered as a card.
 *
 * A page is excluded when its path matches the utility or blog patterns,
 * when its title matches the utility-title pattern, or when it is the site
 * root.
 *
 * The URL patterns are applied to the path only. Matching the full URL
 * would let host labels such as "admin.example.com" or "jobs.example.com"
 * exclude every page on that host, and would miss "/login?next=..."
 * because of the query string. Input that cannot be parsed as an absolute
 * URL (for example a relative path) is matched as given.
 *
 * @param {string} url - Page URL.
 * @param {string} [title] - Page title, if known.
 * @returns {boolean} True when the page is excluded.
 */
function hardExcludePage(url, title) {
  let pathname = null;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: fall back to matching the raw string.
  }
  const path2 = (pathname ?? url).toLowerCase();
  if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
  if (BLOG_URL_RE.test(path2)) return true;
  if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;
  // The home page is excluded as well.
  return pathname === "/" || pathname === "";
}
324
/**
 * Guesses a page type from the URL path.
 *
 * Rules are evaluated in a fixed order and the first match wins:
 * promotion, contact, entity detail paths, amenity, detail, listing, blog.
 *
 * Only the path is inspected. Matching the full URL would classify every
 * page on a host such as "deals.example.com" as a promotion. Input that
 * cannot be parsed as an absolute URL is matched as given.
 *
 * @param {string} url - Page URL.
 * @returns {string|undefined} Page type, or undefined when no rule matches.
 */
function inferTypeFromUrl(url) {
  let target = url;
  try {
    target = new URL(url).pathname;
  } catch {
    // Not an absolute URL: fall back to matching the raw string.
  }
  const path2 = target.toLowerCase();
  if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
  if (CONTACT_URL_RE.test(path2)) return "contact";
  if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
  if (AMENITY_URL_RE.test(path2)) return "amenity";
  if (DETAIL_URL_RE.test(path2)) return "detail";
  if (LISTING_URL_RE.test(path2)) return "listing";
  if (BLOG_URL_RE.test(path2)) return "blog";
  return void 0;
}
335
/**
 * Flattens a parsed JSON-LD payload into a list of object nodes.
 *
 * Arrays are expanded in order; every plain object is emitted and followed
 * by the expansion of its `@graph` member when that member is truthy.
 * Primitives and nullish values are ignored.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Nodes in depth-first, document order.
 */
function collectJsonLdNodes2(data) {
  const found = [];
  // Explicit stack; items are pushed in reverse so they pop in source order.
  const pending = [data];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current == null || typeof current !== "object") continue;
    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i -= 1) pending.push(current[i]);
      continue;
    }
    found.push(current);
    const graph = current["@graph"];
    if (graph) pending.push(graph);
  }
  return found;
}
351
/**
 * Returns the lower-cased short name of a node's primary `@type`.
 *
 * When `@type` is an array only its first entry is considered. Anything up
 * to and including the last "/" is removed, so
 * "https://schema.org/HotelRoom" becomes "hotelroom".
 *
 * @param {object} node - JSON-LD node.
 * @returns {string} Short type name, or "" when no type is declared.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  const primary = Array.isArray(declared) ? declared[0] : declared;
  if (primary == null) return "";
  const lowered = String(primary).toLowerCase();
  // lastIndexOf yields -1 when there is no slash, making this slice(0).
  return lowered.slice(lowered.lastIndexOf("/") + 1);
}
360
/**
 * Infers a page type from structured data embedded in the HTML.
 *
 * JSON-LD scripts are scanned in document order. The first node whose type
 * name is listed in SCHEMA_TYPE_MAP decides the result; a node that carries
 * an `offers` member is treated as "detail". When JSON-LD gives no answer,
 * an `og:type` of "product" also yields "detail".
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {string|undefined} Page type, or undefined when nothing matches.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);
  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;
    try {
      const parsed = JSON.parse(raw);
      for (const node of collectJsonLdNodes2(parsed)) {
        const name = schemaTypeName(node);
        // Own-property check: the type name comes from page content, and a
        // plain lookup would also hit inherited keys such as "constructor".
        if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
          return SCHEMA_TYPE_MAP[name];
        }
        if (node.offers != null) return "detail";
      }
    } catch {
      // Best effort: an unreadable script must not block the others.
    }
  }
  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  if (ogType === "product") return "detail";
  return void 0;
}
379
/**
 * Maps a configured page type onto the canonical type vocabulary.
 *
 * Matching is case-insensitive. Canonical names are returned lower-cased,
 * "room"/"product" become "detail" and "offer"/"sale" become "promotion".
 * Unknown values are returned exactly as given.
 *
 * @param {string|undefined} raw - Configured type.
 * @returns {string} Canonical type, "page" when `raw` is empty, or `raw`.
 */
function normalizePageType(raw) {
  if (!raw) return "page";
  const key = raw.toLowerCase();
  switch (key) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return key;
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      // Custom types pass through with their original casing.
      return raw;
  }
}
398
/**
 * Chooses the title shown on a card.
 *
 * The trimmed heading title is preferred; the document title is used when
 * the heading is missing or blank. The chosen text is passed through
 * `normalizeDisplayTitle`.
 *
 * @param {{headingTitle?: string, title?: string}} input
 * @returns {string|undefined}
 */
function resolveDisplayTitle(input) {
  const trimmedHeading = input.headingTitle?.trim();
  const source = trimmedHeading ? trimmedHeading : input.title;
  return normalizeDisplayTitle(source);
}
403
/**
 * Computes the card-related metadata of a crawled page.
 *
 * Hard-excluded pages are reported as type "system" and are never card
 * eligible. Otherwise the type is resolved in steps, each one applied only
 * while the type is still the generic "page": the configured type, then
 * structured data in the HTML, then the URL, and finally "detail" when a
 * product price was found. Eligibility and priority come from the per-type
 * tables; an eligible page under a promotional path is made ineligible.
 *
 * @param {object} input
 * @param {string} input.url - Page URL.
 * @param {string} [input.title] - Document title.
 * @param {string} [input.headingTitle] - Main heading text.
 * @param {string} [input.description] - Page description.
 * @param {string} [input.imageUrl] - Page image URL.
 * @param {string} [input.html] - Raw HTML, used for structured data.
 * @param {string} [input.type] - Configured page type.
 * @param {boolean} [input.hasProductPrice] - Whether a price was extracted.
 * @returns {{type: string, cardEligible: boolean, cardPriority: number,
 *   displayTitle: (string|undefined), displayDescription: (string|undefined),
 *   displayImageUrl: (string|undefined)}}
 */
function resolvePageCardMetadata(input) {
  const title = input.title?.trim();
  const url = input.url;
  const displayTitle = resolveDisplayTitle(input);
  // Normalized once so that both return paths treat blank text the same way.
  const displayDescription = input.description?.trim() || void 0;
  const displayImageUrl = input.imageUrl;
  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl
    };
  }
  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    const fromSchema = inferTypeFromSchema(input.html);
    if (fromSchema) type = fromSchema;
  }
  if (type === "page") {
    const fromUrl = inferTypeFromUrl(url);
    if (fromUrl) type = fromUrl;
  }
  if (input.hasProductPrice && type === "page") {
    type = "detail";
  }
  const typeKey = String(type).toLowerCase();
  // Own-property lookups: custom type names must not resolve to inherited
  // members of the table objects.
  const lookup = (table, fallback) =>
    Object.prototype.hasOwnProperty.call(table, typeKey) ? table[typeKey] : fallback;
  let cardEligible = lookup(CARD_ELIGIBLE_DEFAULT, false);
  const cardPriority = lookup(CARD_PRIORITY, 3);
  if (cardEligible) {
    // Only the path is checked so that a host label such as
    // "deals.example.com" does not count as a promotional section.
    let promoTarget = url;
    try {
      promoTarget = new URL(url).pathname;
    } catch {
      // Not an absolute URL: match the raw string instead.
    }
    if (PROMOTION_URL_RE.test(promoTarget.toLowerCase())) {
      cardEligible = false;
    }
  }
  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl
  };
}
444
+
445
+ // src/htmlPageExtract.ts
446
// Comma-separated selectors tried when no `contentSelector` option is given.
const DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';

// Elements removed from the DOM when no `removeSelectors` option is given.
const DEFAULT_REMOVE_SELECTORS = [
  "script",
  "style",
  "nav",
  "header",
  "footer",
  ".sidebar",
  ".navigation",
  ".menu",
  ".comments",
  '[role="navigation"]',
  '[role="banner"]'
];
460
/**
 * Derives a document id from a URL.
 *
 * The `http(s)://` prefix is dropped, every run of characters other than
 * ASCII letters and digits becomes a single "-", one leading and one
 * trailing "-" are removed, and the result is cut to 100 characters.
 *
 * @param {string} url - Page URL.
 * @returns {string} Id of at most 100 characters.
 */
function urlToDocumentId(url) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  const dashed = withoutScheme.replace(/[^a-zA-Z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-|-$/g, "");
  return trimmed.substring(0, 100);
}
463
/**
 * Collapses every run of whitespace (spaces, tabs, newlines) into a single
 * space and trims both ends.
 *
 * Earlier revisions chained two further replacements for blank lines and
 * tabs after the whitespace collapse. They could never match, because no
 * newline or tab survives the first step, so they are gone; the output is
 * unchanged.
 *
 * @param {string} text - Raw text.
 * @returns {string} Single-line, single-spaced text.
 */
function cleanContent(text) {
  return text.replace(/\s+/g, " ").trim();
}
466
/**
 * Measures how much text the page body holds once noise elements are
 * removed.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Extraction options; `removeSelectors`
 *   overrides the default noise selectors.
 * @returns {number} Length of the cleaned body text.
 */
function bodyTextLengthHint(html, options = {}) {
  const dom = cheerio3.load(html);
  stripNoiseFromDom(dom, options);
  const bodyText = dom("body").text().trim();
  return cleanContent(bodyText).length;
}
471
/**
 * Turns a page's HTML into an indexable document description.
 *
 * Noise elements are removed first. Title, main text, image, description
 * and type are then read from the remaining DOM, while product and card
 * metadata are derived from the original HTML.
 *
 * @param {string} url - Page URL; also used to resolve relative image URLs.
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Extraction options read here: `titleSelector`,
 *   `minExtractedContentLength` (default 50), `defaultType`, `typeFromUrl`
 *   (substring -> type) and `metadata` (merged last, so it overrides).
 * @returns {{id: string, metadata: object, content: string,
 *   indexable: boolean, contentPreview: string}}
 */
function extractPageFromHtml(url, html, options = {}) {
  const $ = cheerio3.load(html);
  stripNoiseFromDom($, options);

  // Title: an explicit selector wins, otherwise <title> is preferred to <h1>.
  const h1Title = $("h1").first().text().trim();
  const docTitle = $("title").text().trim();
  const preferredTitle = options.titleSelector
    ? $(options.titleSelector).first().text().trim()
    : docTitle || h1Title;
  const title = preferredTitle || h1Title || docTitle;

  const content = extractBestContentText($, options);
  const minChars = options.minExtractedContentLength ?? 50;
  const indexable = Boolean(content && content.length >= minChars);

  // Image: candidates are evaluated lazily and the first truthy one is used.
  const imageCandidates = [
    () => $('meta[property="og:image"]').attr("content"),
    () => $('meta[name="twitter:image"]').attr("content"),
    () => $('meta[property="product:image"]').attr("content"),
    () => $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src"),
    () => extractHeroImage($, url)
  ];
  let image;
  for (const readCandidate of imageCandidates) {
    image = readCandidate();
    if (image) break;
  }
  let imageUrl;
  if (image) {
    try {
      imageUrl = new URL(image, url).href;
    } catch {
      // Keep the raw value when it cannot be resolved against the page URL.
      imageUrl = image;
    }
  }

  const description =
    $('meta[property="og:description"]').attr("content") ||
    $('meta[name="description"]').attr("content") ||
    void 0;

  // Type: the first `typeFromUrl` pattern contained in the URL wins.
  const urlRule = options.typeFromUrl
    ? Object.entries(options.typeFromUrl).find(([pattern]) => url.includes(pattern))
    : void 0;
  const type = urlRule ? urlRule[1] : options.defaultType || "page";

  const productMeta = extractProductMetadata(html);
  const cardMeta = resolvePageCardMetadata({
    url,
    title,
    headingTitle: h1Title || void 0,
    description,
    imageUrl,
    html,
    type,
    hasProductPrice: productMeta.price != null
  });

  // Optional fields are added only when they hold a value.
  const metadata = {
    type: cardMeta.type,
    cardEligible: cardMeta.cardEligible,
    cardPriority: cardMeta.cardPriority,
    ...(title ? { title } : {}),
    ...(cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {}),
    url,
    ...(imageUrl ? { imageUrl } : {}),
    ...(cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {}),
    ...(description ? { description } : {}),
    ...(cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {}),
    ...(productMeta.price != null ? { price: productMeta.price } : {}),
    ...(productMeta.currency ? { currency: productMeta.currency } : {}),
    ...(productMeta.availability ? { availability: productMeta.availability } : {}),
    ...options.metadata
  };

  const previewLen = 400;
  const contentPreview =
    content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;

  return {
    id: urlToDocumentId(url),
    metadata,
    content,
    indexable,
    contentPreview
  };
}
544
/**
 * Removes non-content elements from the DOM in place.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @param {object} options - `removeSelectors` replaces the default selector
 *   list when it is not null/undefined (an empty list removes nothing).
 * @returns {void}
 */
function stripNoiseFromDom($, options) {
  const selectors = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
  for (const selector of selectors) {
    $(selector).remove();
  }
}
548
/**
 * Returns the longest cleaned text among all content-selector matches and
 * the full body.
 *
 * @param {Function} $ - Cheerio root for the page (noise already removed).
 * @param {object} options - `contentSelector` (comma-separated) replaces
 *   the default selector chain when truthy.
 * @returns {string} Longest candidate text, or "" when the page has none.
 */
function extractBestContentText($, options) {
  const selectorList = (options.contentSelector || DEFAULT_CONTENT_SELECTOR)
    .split(",")
    .map((part) => part.trim())
    .filter(Boolean);
  let longest = "";
  // Strictly longer only, so the earliest candidate wins on equal length.
  const consider = (rawText) => {
    const cleaned = cleanContent(rawText.trim());
    if (cleaned.length > longest.length) longest = cleaned;
  };
  for (const selector of selectorList) {
    for (const el of $(selector).toArray()) {
      consider($(el).text());
    }
  }
  consider($("body").text());
  return longest;
}
562
/**
 * Picks the first meaningful image in the main content area.
 *
 * Images are searched inside the main-content containers, or inside the
 * body when none exists. An image is skipped when a declared width or
 * height is below 80, when its src or alt text contains a keyword such as
 * "logo" or "icon", or when it is a data URI or an SVG. For Next.js
 * optimizer URLs the `url` query parameter is returned, resolved against
 * the page URL when it does not start with "http".
 *
 * @param {Function} $ - Cheerio root for the page.
 * @param {string} pageUrl - Page URL used as the base for resolution.
 * @returns {string|undefined} Image URL, or undefined when none qualifies.
 */
function extractHeroImage($, pageUrl) {
  const containers = $('main, article, [role="main"], #content, .content');
  const scope = containers.length > 0 ? containers : $("body");
  // Only a declared, positive size below 80 disqualifies an image.
  const isTooSmall = (size) => size > 0 && size < 80;
  for (const el of scope.find("img[src]").toArray()) {
    const img = $(el);
    const src = img.attr("src") || "";
    const alt = (img.attr("alt") || "").toLowerCase();
    const width = parseInt(img.attr("width") || "0", 10);
    const height = parseInt(img.attr("height") || "0", 10);
    if (isTooSmall(width) || isTooSmall(height)) continue;
    if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) continue;
    if (src.startsWith("data:") || src.endsWith(".svg")) continue;
    if (src.includes("/_next/image")) {
      try {
        const realUrl = new URL(src, pageUrl).searchParams.get("url");
        if (realUrl) {
          return realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
        }
      } catch {
        // Unparseable optimizer URL: fall through and use the src as is.
      }
    }
    return src;
  }
  return void 0;
}
591
+
592
+ // src/WebRAGPlugin.ts
43
593
  function bulkOpCurrentUrl(op) {
44
594
  const meta = op.document?.metadata;
45
595
  if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
@@ -58,7 +608,7 @@ function isUrlListingInsert(document) {
58
608
  return false;
59
609
  }
60
610
  }
61
- var WebRAGPlugin = class _WebRAGPlugin {
611
+ var WebRAGPlugin = class {
62
612
  name = "web-rag";
63
613
  type = "rag";
64
614
  priority;
@@ -281,13 +831,21 @@ var WebRAGPlugin = class _WebRAGPlugin {
281
831
  plugin: this.name,
282
832
  contentCount: scoredResults.length,
283
833
  types: [...new Set(scoredResults.map((d) => d.metadata.type))],
284
- topResults: scoredResults.slice(0, 5).map((doc) => ({
834
+ topResults: scoredResults.slice(0, 16).map((doc) => ({
285
835
  id: doc.id,
286
836
  type: doc.metadata.type,
287
837
  title: doc.metadata.title,
288
838
  url: doc.metadata.url,
289
839
  imageUrl: doc.metadata.imageUrl,
290
840
  description: doc.metadata.description,
841
+ cardEligible: doc.metadata.cardEligible,
842
+ cardPriority: doc.metadata.cardPriority,
843
+ displayTitle: doc.metadata.displayTitle,
844
+ displayDescription: doc.metadata.displayDescription,
845
+ displayImageUrl: doc.metadata.displayImageUrl,
846
+ ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
847
+ ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
848
+ ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
291
849
  score: doc.score
292
850
  }))
293
851
  }
@@ -1458,7 +2016,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1458
2016
  return await response.text();
1459
2017
  }
1460
2018
  extractInternalLinks(html, base, stripQueryParams) {
1461
- const $ = cheerio.load(html);
2019
+ const $ = cheerio4.load(html);
1462
2020
  const links = /* @__PURE__ */ new Set();
1463
2021
  $("a[href]").each((_, el) => {
1464
2022
  const href = ($(el).attr("href") || "").trim();
@@ -1624,7 +2182,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1624
2182
  }
1625
2183
  }
1626
2184
  try {
1627
- const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
2185
+ const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
1628
2186
  renderMode,
1629
2187
  renderOptions,
1630
2188
  minContentLength,
@@ -1655,7 +2213,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1655
2213
  status: crawlSt,
1656
2214
  modeUsed: diag?.modeUsed,
1657
2215
  contentLength: doc?.content?.length,
1658
- bodyTextLengthHint,
2216
+ bodyTextLengthHint: bodyTextLengthHint2,
1659
2217
  title: doc?.metadata?.title,
1660
2218
  docId: doc?.id,
1661
2219
  error: diag?.errorMessage
@@ -1767,125 +2325,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
1767
2325
  const html = await response.text();
1768
2326
  return this.extractDocumentFromHtml(url, html, config);
1769
2327
  }
1770
- /**
1771
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
1772
- * would otherwise hit an empty wrapper.
1773
- */
1774
- static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
1775
- stripNoiseFromDom($, config) {
1776
- const removeSelectors = config.removeSelectors || [
1777
- "script",
1778
- "style",
1779
- "nav",
1780
- "header",
1781
- "footer",
1782
- ".sidebar",
1783
- ".navigation",
1784
- ".menu",
1785
- ".comments",
1786
- '[role="navigation"]',
1787
- '[role="banner"]'
1788
- ];
1789
- removeSelectors.forEach((selector) => $(selector).remove());
1790
- }
1791
- /** Longest cleaned text among selector matches and full body (after noise strip). */
1792
- extractBestContentText($, config) {
1793
- const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
1794
- const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
1795
- let best = "";
1796
- for (const sel of selectors) {
1797
- $(sel).each((_, el) => {
1798
- const t = this.cleanContent($(el).text().trim());
1799
- if (t.length > best.length) best = t;
1800
- });
1801
- }
1802
- const bodyText = this.cleanContent($("body").text().trim());
1803
- if (bodyText.length > best.length) best = bodyText;
1804
- return best;
1805
- }
1806
2328
  bodyTextLengthHint(html, config) {
1807
- const $ = cheerio.load(html);
1808
- this.stripNoiseFromDom($, config);
1809
- return this.cleanContent($("body").text().trim()).length;
2329
+ return bodyTextLengthHint(html, config);
1810
2330
  }
1811
2331
  extractDocumentFromHtml(url, html, config) {
1812
- const $ = cheerio.load(html);
1813
- this.stripNoiseFromDom($, config);
1814
- const titleSelector = config.titleSelector || "h1, title";
1815
- let title = $(titleSelector).first().text().trim();
1816
- if (!title) {
1817
- title = $("title").text().trim();
1818
- }
1819
- const content = this.extractBestContentText($, config);
1820
- const minChars = config.minExtractedContentLength ?? 50;
1821
- if (!content || content.length < minChars) return null;
1822
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1823
- this.extractHeroImage($, url) || void 0;
1824
- let imageUrl;
1825
- if (image) {
1826
- try {
1827
- imageUrl = new URL(image, url).href;
1828
- } catch {
1829
- imageUrl = image;
1830
- }
1831
- }
1832
- const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
1833
- let type = config.defaultType || "page";
1834
- if (config.typeFromUrl) {
1835
- for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
1836
- if (url.includes(pattern)) {
1837
- type = typeName;
1838
- break;
1839
- }
1840
- }
1841
- }
1842
- const id = this.urlToId(url);
2332
+ const extracted = extractPageFromHtml(url, html, config);
2333
+ if (!extracted.indexable) return null;
1843
2334
  return {
1844
- id,
1845
- content,
1846
- metadata: {
1847
- type,
1848
- title,
1849
- url,
1850
- ...imageUrl ? { imageUrl } : {},
1851
- ...description ? { description } : {},
1852
- ...config.metadata
1853
- }
2335
+ id: extracted.id,
2336
+ content: extracted.content,
2337
+ metadata: extracted.metadata
1854
2338
  };
1855
2339
  }
1856
- /**
1857
- * Fallback image extraction: finds the first meaningful image in the content area.
1858
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
1859
- */
1860
- extractHeroImage($, pageUrl) {
1861
- const containers = $('main, article, [role="main"], #content, .content');
1862
- const scope = containers.length > 0 ? containers : $("body");
1863
- let best;
1864
- scope.find("img[src]").each((_, el) => {
1865
- if (best) return false;
1866
- const src = $(el).attr("src") || "";
1867
- const alt = ($(el).attr("alt") || "").toLowerCase();
1868
- const width = parseInt($(el).attr("width") || "0", 10);
1869
- const height = parseInt($(el).attr("height") || "0", 10);
1870
- if (width > 0 && width < 80 || height > 0 && height < 80) return;
1871
- if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1872
- if (src.startsWith("data:") || src.endsWith(".svg")) return;
1873
- if (src.includes("/_next/image")) {
1874
- try {
1875
- const nextUrl = new URL(src, pageUrl);
1876
- const realUrl = nextUrl.searchParams.get("url");
1877
- if (realUrl) {
1878
- best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1879
- return false;
1880
- }
1881
- } catch {
1882
- }
1883
- }
1884
- best = src;
1885
- return false;
1886
- });
1887
- return best;
1888
- }
1889
2340
  looksLikeDynamicShell(html) {
1890
2341
  const lower = html.toLowerCase();
1891
2342
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
@@ -1903,7 +2354,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1903
2354
  const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
1904
2355
  return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
1905
2356
  }
1906
- diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
2357
+ diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
1907
2358
  if (blockedSuspected) {
1908
2359
  return {
1909
2360
  doc: null,
@@ -1919,12 +2370,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1919
2370
  return {
1920
2371
  doc,
1921
2372
  diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
1922
- bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
2373
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
1923
2374
  };
1924
2375
  }
1925
2376
  async crawlPageSmart(url, config, timeout, ctx) {
1926
2377
  if (ctx.renderMode === true) {
1927
- const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
2378
+ const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
1928
2379
  url,
1929
2380
  config,
1930
2381
  timeout,
@@ -1933,7 +2384,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1933
2384
  );
1934
2385
  return this.diagFromRenderedAttempt(
1935
2386
  doc,
1936
- bodyTextLengthHint,
2387
+ bodyTextLengthHint2,
1937
2388
  renderFailure,
1938
2389
  blockedSuspected,
1939
2390
  "render_ok",
@@ -2050,7 +2501,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2050
2501
  }
2051
2502
  }
2052
2503
  const html = await page.content();
2053
- const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
2504
+ const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
2054
2505
  const doc = this.extractDocumentFromHtml(url, html, config);
2055
2506
  if (config.debug?.saveDir && config.debug?.enabled) {
2056
2507
  try {
@@ -2065,7 +2516,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2065
2516
  dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
2066
2517
  }
2067
2518
  }
2068
- return { doc, bodyTextLengthHint };
2519
+ return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
2069
2520
  } catch (e) {
2070
2521
  const msg = String(e?.message || e || "render_failed");
2071
2522
  const lower = msg.toLowerCase();
@@ -2157,14 +2608,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
2157
2608
  /**
2158
2609
  * Clean extracted text content
2159
2610
  */
2160
- cleanContent(text) {
2161
- return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
2162
- }
2163
- /**
2164
- * Convert URL to a stable document ID
2165
- */
2166
2611
  urlToId(url) {
2167
- return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
2612
+ return urlToDocumentId(url);
2168
2613
  }
2169
2614
  /**
2170
2615
  * Delay helper
@@ -2434,5 +2879,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
2434
2879
  };
2435
2880
  // Annotate the CommonJS export names for ESM import in node:
2436
2881
  0 && (module.exports = {
2437
- WebRAGPlugin
2882
+ WebRAGPlugin,
2883
+ bodyTextLengthHint,
2884
+ extractPageFromHtml,
2885
+ extractProductMetadata,
2886
+ hardExcludePage,
2887
+ inferTypeFromUrl,
2888
+ normalizeAvailability,
2889
+ normalizeCurrency,
2890
+ normalizeDisplayTitle,
2891
+ parsePrice,
2892
+ resolvePageCardMetadata,
2893
+ urlToDocumentId
2438
2894
  });