@snap-agent/rag-web 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,9 +1,548 @@
1
1
  // src/WebRAGPlugin.ts
2
2
  import { MongoClient } from "mongodb";
3
3
  import OpenAI from "openai";
4
- import * as cheerio from "cheerio";
4
+ import * as cheerio4 from "cheerio";
5
5
  import * as fs from "fs";
6
6
  import * as path from "path";
7
+
8
+ // src/htmlPageExtract.ts
9
+ import * as cheerio3 from "cheerio";
10
+
11
+ // src/productMetadata.ts
12
+ import * as cheerio from "cheerio";
13
/**
 * Extracts price, currency and availability from a page's HTML.
 *
 * Every field is resolved independently, preferring JSON-LD, then Open Graph
 * meta tags, then microdata. A field that no source supplies is left off the
 * returned object.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractProductMetadata(html) {
  const $ = cheerio.load(html);
  const [jsonLd, openGraph, microdata] = [
    extractFromJsonLd($),
    extractFromOpenGraph($),
    extractFromMicrodata($)
  ];
  const firstDefined = (field) => jsonLd[field] ?? openGraph[field] ?? microdata[field];

  const merged = {};
  const price = firstDefined("price");
  // A price of 0 is legitimate, so only null/undefined count as "missing".
  if (price != null) merged.price = price;
  for (const field of ["currency", "availability"]) {
    const picked = firstDefined(field);
    if (picked) merged[field] = picked;
  }
  return merged;
}
27
/**
 * Reads price, currency and availability from the page's JSON-LD scripts.
 *
 * Scripts are visited in document order; scanning stops before a script once
 * all three fields are known. Only nodes typed as Product that carry an offer
 * contribute, and the first value found for each field is kept.
 *
 * @param $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromJsonLd($) {
  const found = {};
  const isComplete = () => found.price != null && Boolean(found.currency) && Boolean(found.availability);

  for (const script of $('script[type="application/ld+json"]').toArray()) {
    if (isComplete()) break;
    const text = $(script).html()?.trim();
    if (!text) continue;

    let data;
    try {
      data = JSON.parse(text);
    } catch {
      // Malformed JSON-LD is skipped; later scripts may still be valid.
      continue;
    }

    for (const node of collectJsonLdNodes(data)) {
      const offer = isProductType(node) ? pickOffer(node) : null;
      if (!offer) continue;

      if (found.price == null) {
        const parsedPrice = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
        if (parsedPrice != null) found.price = parsedPrice;
      }
      if (!found.currency) {
        const code = normalizeCurrency(offer.priceCurrency);
        if (code) found.currency = code;
      }
      if (!found.availability) {
        const state = normalizeAvailability(offer.availability);
        if (state) found.availability = state;
      }
    }
  }
  return found;
}
59
/**
 * Reads price, currency and availability from Open Graph style meta tags.
 * For each field the `product:*` property is tried first and the `og:*`
 * property is used when the former is absent or empty.
 *
 * @param $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromOpenGraph($) {
  const metaContent = (primary, fallback) =>
    $(`meta[property="${primary}"]`).attr("content") || $(`meta[property="${fallback}"]`).attr("content");

  const found = {};

  const amount = parsePrice(metaContent("product:price:amount", "og:price:amount"));
  if (amount != null) found.price = amount;

  const code = normalizeCurrency(metaContent("product:price:currency", "og:price:currency"));
  if (code) found.currency = code;

  const state = normalizeAvailability(metaContent("product:availability", "og:availability"));
  if (state) found.availability = state;

  return found;
}
74
/**
 * Finds the first element carrying the given `itemprop`.
 * When the document has a schema.org Product scope, the search is limited to
 * the first such scope; otherwise the whole document is searched.
 *
 * @param $ - Loaded cheerio document.
 * @param {string} itemprop - Microdata property name to look up.
 * @returns Cheerio selection (possibly empty).
 */
function microdataField($, itemprop) {
  const propSelector = `[itemprop="${itemprop}"]`;
  const productScope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
  if (productScope.length > 0) {
    return productScope.find(propSelector).first();
  }
  return $(propSelector).first();
}
78
/**
 * Reads price, currency and availability from microdata (`itemprop`) markup.
 * The `content` attribute is preferred over the element text; availability
 * additionally accepts an `href` attribute before falling back to text.
 *
 * @param $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromMicrodata($) {
  const readField = (itemprop, extraAttr) => {
    const el = microdataField($, itemprop);
    return el.attr("content") || (extraAttr ? el.attr(extraAttr) : void 0) || el.text();
  };

  const found = {};

  const amount = parsePrice(readField("price"));
  if (amount != null) found.price = amount;

  const code = normalizeCurrency(readField("priceCurrency"));
  if (code) found.currency = code;

  const state = normalizeAvailability(readField("availability", "href"));
  if (state) found.availability = state;

  return found;
}
93
/**
 * Flattens a parsed JSON-LD payload into a list of object nodes.
 * Arrays are expanded element by element, and every object's `@graph`
 * member is followed; other nested properties are not descended into.
 * Nodes are returned in pre-order (a node precedes its `@graph` entries).
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Every object node encountered.
 */
function collectJsonLdNodes(data) {
  const collected = [];

  function walk(entry) {
    if (Array.isArray(entry)) {
      for (const item of entry) walk(item);
      return;
    }
    // Primitives, null and undefined carry no node information.
    if (entry === null || typeof entry !== "object") return;
    collected.push(entry);
    const graph = entry["@graph"];
    if (graph) walk(graph);
  }

  walk(data);
  return collected;
}
109
/**
 * Tells whether a JSON-LD node declares itself as a Product.
 * `@type` may be a single value or an array; a value matches when, ignoring
 * case, it is exactly "product" or ends with "/product".
 *
 * @param {object} node - JSON-LD node.
 * @returns {boolean}
 */
function isProductType(node) {
  const declared = node["@type"];
  if (declared == null) return false;
  const candidates = Array.isArray(declared) ? declared : [declared];
  for (const candidate of candidates) {
    const name = String(candidate).toLowerCase();
    if (name === "product" || name.endsWith("/product")) return true;
  }
  return false;
}
117
/**
 * Selects the offer object to read pricing from.
 * A single offer object is returned as-is; for an array, the first entry
 * that is a non-null object is returned.
 *
 * @param {object} product - JSON-LD Product node.
 * @returns {object|null} The chosen offer, or null when none is usable.
 */
function pickOffer(product) {
  const { offers } = product;
  if (offers == null || typeof offers !== "object") return null;
  if (!Array.isArray(offers)) return offers;
  return offers.find((candidate) => candidate && typeof candidate === "object") ?? null;
}
127
/**
 * Parses a price from a number or a loosely formatted string.
 *
 * Everything except digits, ".", "," and "-" is stripped first. Separators
 * are then interpreted as follows:
 *  - both "," and "." present: whichever occurs last is the decimal mark;
 *  - only ",": a single comma followed by at most two digits is a decimal
 *    mark, otherwise commas are thousands separators;
 *  - only ".": two or more strictly grouped dots ("1.234.567") are thousands
 *    separators; a single dot is kept as the decimal mark.
 *
 * @param {*} value - Number or string (e.g. "$1,234.56", "1.234,56 €").
 * @returns {number|undefined} The parsed price, or undefined when no finite
 *   number can be derived.
 */
function parsePrice(value) {
  if (value == null || value === "") return void 0;
  if (typeof value === "number" && Number.isFinite(value)) return value;
  let s = String(value).trim();
  if (!s) return void 0;
  s = s.replace(/[^\d.,\-]/g, "");
  if (!s || s === "-" || s === ".") return void 0;
  const lastComma = s.lastIndexOf(",");
  const lastDot = s.lastIndexOf(".");
  if (lastComma > -1 && lastDot > -1) {
    if (lastComma > lastDot) {
      // "1.234,56": dots group thousands, the comma is the decimal mark.
      s = s.replace(/\./g, "").replace(",", ".");
    } else {
      // "1,234.56": commas group thousands.
      s = s.replace(/,/g, "");
    }
  } else if (lastComma > -1) {
    const parts = s.split(",");
    if (parts.length === 2 && parts[1].length <= 2) {
      s = parts[0].replace(/\./g, "") + "." + parts[1];
    } else {
      s = s.replace(/,/g, "");
    }
  } else if (/^-?\d{1,3}(?:\.\d{3}){2,}$/.test(s)) {
    // Several dots cannot all be decimal marks; without this, parseFloat
    // would stop at the second dot and turn "1.234.567" into 1.234.
    s = s.replace(/\./g, "");
  }
  const num = parseFloat(s);
  return Number.isFinite(num) ? num : void 0;
}
153
/**
 * Normalizes a currency value to an upper-case code or short symbol.
 *
 * A standalone three-letter token (not adjacent to other letters) is
 * preferred, so "price in usd" yields "USD" rather than the first three
 * letters of "price". When no standalone token exists, the first run of
 * three letters is used, and failing that, inputs of at most four characters
 * (such as "$" or "US$") are returned as-is.
 *
 * @param {*} value - Raw currency text.
 * @returns {string|undefined} Upper-cased currency, or undefined.
 */
function normalizeCurrency(value) {
  if (value == null) return void 0;
  const s = String(value).trim().toUpperCase();
  if (!s) return void 0;
  const iso = s.match(/(?<![A-Z])[A-Z]{3}(?![A-Z])/) ?? s.match(/[A-Z]{3}/);
  if (iso) return iso[0];
  return s.length <= 4 ? s : void 0;
}
160
/**
 * Normalizes an availability value to a compact token such as "InStock".
 *
 * A leading `http(s)://host/` prefix and any trailing slashes are removed,
 * only the last path segment is kept, and all whitespace is stripped
 * ("In Stock" becomes "InStock").
 *
 * @param {*} value - Raw availability text or schema.org URL.
 * @returns {string|undefined} The normalized token, or undefined when empty.
 */
function normalizeAvailability(value) {
  if (value == null) return void 0;
  let s = String(value).trim();
  if (!s) return void 0;
  // Trailing slashes are dropped up front so "…/InStock/" does not leave an
  // empty last segment (which previously produced "InStock/").
  s = s.replace(/^https?:\/\/[^/]+\//, "").replace(/\/+$/, "");
  const lastSlash = s.lastIndexOf("/");
  if (lastSlash >= 0) s = s.slice(lastSlash + 1);
  return s.replace(/\s+/g, "") || void 0;
}
175
+
176
+ // src/pageCardMetadata.ts
177
+ import * as cheerio2 from "cheerio";
178
// --- URL / title patterns that force a page to the non-card "system" type ---
const HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
const HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
const BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;

// --- URL patterns used to infer a page type from its path ---
const PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
const PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
// Collection segments followed by a further path segment are treated as detail pages.
const ENTITY_DETAIL_PATH_RES = [
  /\/projects\/[^/]+/i,
  /\/project\/[^/]+/i,
  /\/perspectives\/[^/]+/i,
  /\/perspective\/[^/]+/i,
  /\/portfolio\/[^/]+/i,
  /\/case-stud(?:y|ies)\/[^/]+/i,
  /\/insights?\/[^/]+/i,
  /\/people\/[^/]+/i,
  /\/person\/[^/]+/i,
  /\/team-members?\/[^/]+/i,
  /\/members?\/[^/]+/i,
  /\/staff\/[^/]+/i,
  /\/experts?\/[^/]+/i,
  /\/authors?\/[^/]+/i,
  /\/leadership\/[^/]+/i,
  /\/biograph(?:y|ies)\/[^/]+/i
];
const DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
const LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
const AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
const CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;

// --- Title suffix patterns (" – Site name", " | Site name") ---
const EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
const PIPE_SUFFIX_RE = /\s+\|\s+.+$/;

// Card priority per page type; callers fall back to 3 for unknown types.
const CARD_PRIORITY = {
  detail: 10,
  listing: 6,
  amenity: 5,
  promotion: 2,
  contact: 1,
  content: 1,
  blog: 0,
  system: 0,
  page: 3
};

// Whether a page type may be shown as a card by default.
const CARD_ELIGIBLE_DEFAULT = {
  detail: true,
  listing: true,
  amenity: true,
  promotion: false,
  contact: false,
  content: false,
  blog: false,
  system: false,
  page: false
};

// Lower-cased schema.org type name -> page type.
const SCHEMA_TYPE_MAP = {
  product: "detail",
  service: "amenity",
  hotelroom: "detail",
  room: "detail",
  apartment: "detail",
  lodgingroom: "detail",
  course: "detail",
  event: "detail",
  offer: "promotion",
  person: "detail",
  employee: "detail",
  profilepage: "detail",
  article: "detail",
  newsarticle: "detail",
  blogposting: "detail",
  creativework: "detail"
};
247
/**
 * Strips trailing site-name suffixes from a page title.
 *
 * Up to two passes are made. In each pass a " – …" / " — …" suffix is removed
 * when at least 4 characters precede it; otherwise a " | …" suffix is removed
 * when at least 8 characters precede it. A blank or missing title is
 * returned unchanged.
 *
 * @param {string|undefined} title - Raw page title.
 * @returns {string|undefined} The shortened title.
 */
function normalizeDisplayTitle(title) {
  const trimmed = title?.trim();
  if (!trimmed) return title;

  // [pattern, minimum length of the text that must remain before the suffix]
  const suffixRules = [
    [EN_DASH_SUFFIX_RE, 4],
    [PIPE_SUFFIX_RE, 8]
  ];

  let current = trimmed;
  for (let pass = 0; pass < 2; pass += 1) {
    let cutAt = -1;
    for (const [pattern, minPrefix] of suffixRules) {
      const found = current.match(pattern);
      if (found && found.index !== void 0 && found.index >= minPrefix) {
        cutAt = found.index;
        break;
      }
    }
    if (cutAt < 0) break;
    current = current.slice(0, cutAt).trim();
  }
  return current || trimmed;
}
265
/**
 * Decides whether a page must never be offered as a card.
 *
 * A page is excluded when its lower-cased URL matches the hard-exclude or
 * blog patterns, when its title matches the hard-exclude title pattern, or
 * when the URL parses and its pathname is the site root.
 *
 * @param {string} url - Page URL.
 * @param {string} [title] - Page title, if known.
 * @returns {boolean} True when the page is excluded.
 */
function hardExcludePage(url, title) {
  const lowered = url.toLowerCase();
  const excludedByText =
    HARD_EXCLUDE_URL_RE.test(lowered) ||
    BLOG_URL_RE.test(lowered) ||
    Boolean(title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase()));
  if (excludedByText) return true;

  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: the root check cannot be applied.
    return false;
  }
  return pathname === "/" || pathname === "";
}
277
/**
 * Infers a page type from the URL alone.
 * Rules are checked in a fixed order and the first match wins: promotion,
 * contact, entity detail paths, amenity, detail, listing, blog.
 *
 * @param {string} url - Page URL.
 * @returns {string|undefined} The inferred type, or undefined when no rule matches.
 */
function inferTypeFromUrl(url) {
  const lowered = url.toLowerCase();
  const orderedRules = [
    [(p) => PROMOTION_URL_RE.test(p) || PROMOTION_SLUG_RE.test(p), "promotion"],
    [(p) => CONTACT_URL_RE.test(p), "contact"],
    [(p) => ENTITY_DETAIL_PATH_RES.some((re) => re.test(p)), "detail"],
    [(p) => AMENITY_URL_RE.test(p), "amenity"],
    [(p) => DETAIL_URL_RE.test(p), "detail"],
    [(p) => LISTING_URL_RE.test(p), "listing"],
    [(p) => BLOG_URL_RE.test(p), "blog"]
  ];
  for (const [matches, pageType] of orderedRules) {
    if (matches(lowered)) return pageType;
  }
  return void 0;
}
288
/**
 * Flattens a parsed JSON-LD payload into a list of object nodes.
 * Arrays are expanded and each object's `@graph` member is followed; the
 * result is in pre-order (a node comes before its `@graph` entries).
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Every object node encountered.
 */
function collectJsonLdNodes2(data) {
  const collected = [];
  // Explicit stack instead of recursion; items are pushed in reverse so they
  // are popped in their original order.
  const pending = [data];
  while (pending.length > 0) {
    const entry = pending.pop();
    if (entry == null) continue;
    if (Array.isArray(entry)) {
      for (let i = entry.length - 1; i >= 0; i -= 1) pending.push(entry[i]);
      continue;
    }
    if (typeof entry !== "object") continue;
    collected.push(entry);
    const graph = entry["@graph"];
    if (graph) pending.push(graph);
  }
  return collected;
}
304
/**
 * Returns the lower-cased short name of a node's first `@type`.
 * For URL-style types only the part after the last "/" is kept.
 *
 * @param {object} node - JSON-LD node.
 * @returns {string} The type name, or "" when the node has no type.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  const primary = Array.isArray(declared) ? declared[0] : declared;
  if (primary == null) return "";
  const lowered = String(primary).toLowerCase();
  // lastIndexOf yields -1 when there is no slash, making this slice(0).
  return lowered.slice(lowered.lastIndexOf("/") + 1);
}
313
/**
 * Infers a page type from structured data embedded in the HTML.
 *
 * JSON-LD scripts are scanned in document order. The first node whose type
 * name is a key of SCHEMA_TYPE_MAP decides the result; a node carrying an
 * `offers` member is treated as "detail". When JSON-LD gives no answer, an
 * `og:type` of "product" also yields "detail".
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {string|undefined} The inferred page type, or undefined.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);
  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;
    try {
      const parsed = JSON.parse(raw);
      for (const node of collectJsonLdNodes2(parsed)) {
        const name = schemaTypeName(node);
        // Own-property check: the type name comes from the crawled page, and
        // a plain lookup would resolve names like "constructor" through the
        // prototype chain and return a function as the page type.
        if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
          return SCHEMA_TYPE_MAP[name];
        }
        if (node.offers != null) return "detail";
      }
    } catch {
      // Best effort: an unparseable script is skipped so later scripts and
      // the og:type fallback still get a chance.
    }
  }
  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  if (ogType === "product") return "detail";
  return void 0;
}
332
/**
 * Maps a configured page type onto the known type vocabulary.
 * Known types are returned lower-cased; "room"/"product" become "detail"
 * and "offer"/"sale" become "promotion". A missing type is "page", and any
 * other value is returned exactly as given.
 *
 * @param {string|undefined} raw - Configured type.
 * @returns {string}
 */
function normalizePageType(raw) {
  if (!raw) return "page";
  const lowered = raw.toLowerCase();
  switch (lowered) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return lowered;
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      return raw;
  }
}
351
/**
 * Chooses the title shown on a card: the trimmed heading title when it is
 * non-empty, otherwise the page title, passed through normalizeDisplayTitle.
 *
 * @param {{headingTitle?: string, title?: string}} input
 * @returns {string|undefined}
 */
function resolveDisplayTitle(input) {
  const preferred = input.headingTitle?.trim() || input.title;
  return normalizeDisplayTitle(preferred);
}
356
/**
 * Resolves the card-related metadata for a crawled page.
 *
 * Hard-excluded pages are reported as type "system" and are never card
 * eligible. Otherwise the type starts from `input.type`; while it is still
 * the generic "page" it is refined from structured data in `input.html`,
 * then from the URL, then from the presence of a product price. Eligibility
 * and priority come from the per-type tables, and a URL matching the
 * promotion pattern is never eligible.
 *
 * @param {object} input
 * @param {string} input.url - Page URL.
 * @param {string} [input.title] - Document title.
 * @param {string} [input.headingTitle] - Main heading text.
 * @param {string} [input.description] - Page description.
 * @param {string} [input.imageUrl] - Page image URL.
 * @param {string} [input.html] - Raw HTML, used for structured-data lookup.
 * @param {string} [input.type] - Configured page type.
 * @param {boolean} [input.hasProductPrice] - Whether a price was extracted.
 * @returns {{type: *, cardEligible: boolean, cardPriority: number,
 *   displayTitle: (string|undefined), displayDescription: (string|undefined),
 *   displayImageUrl: (string|undefined)}}
 */
function resolvePageCardMetadata(input) {
  const title = input.title?.trim();
  const url = input.url;
  const displayTitle = resolveDisplayTitle(input);
  // Normalized once so both return paths agree on blank descriptions.
  const displayDescription = input.description?.trim() || void 0;
  const displayImageUrl = input.imageUrl;

  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl
    };
  }

  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    const fromSchema = inferTypeFromSchema(input.html);
    if (fromSchema) type = fromSchema;
  }
  if (type === "page") {
    const fromUrl = inferTypeFromUrl(url);
    if (fromUrl) type = fromUrl;
  }
  if (input.hasProductPrice && type === "page") {
    type = "detail";
  }

  // Own-property lookups: the type may be an arbitrary configured string, and
  // a plain index would resolve e.g. "constructor" through the prototype
  // chain instead of falling back to the defaults.
  const typeKey = String(type).toLowerCase();
  const lookup = (table, fallback) =>
    Object.prototype.hasOwnProperty.call(table, typeKey) ? table[typeKey] : fallback;
  let cardEligible = lookup(CARD_ELIGIBLE_DEFAULT, false);
  const cardPriority = lookup(CARD_PRIORITY, 3);
  if (cardEligible && PROMOTION_URL_RE.test(url.toLowerCase())) {
    cardEligible = false;
  }

  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl
  };
}
397
+
398
+ // src/htmlPageExtract.ts
399
// Candidate containers for a page's main content, joined into one
// comma-separated selector list.
const DEFAULT_CONTENT_SELECTOR = [
  "article",
  "main",
  '[role="main"]',
  "#content",
  "#primary",
  "#main",
  ".content",
  ".post-content",
  ".entry-content",
  ".elementor-location-content",
  ".elementor-widget-theme-post-content",
  ".wp-block-group",
  ".site-content",
  ".ast-single-post",
  ".ast-page"
].join(", ");

// Elements removed from the DOM before any text is extracted.
const DEFAULT_REMOVE_SELECTORS = [
  "script",
  "style",
  "nav",
  "header",
  "footer",
  ".sidebar",
  ".navigation",
  ".menu",
  ".comments",
  '[role="navigation"]',
  '[role="banner"]'
];
413
/**
 * Derives a document id from a URL: the scheme is dropped, every
 * non-alphanumeric character becomes "-", runs of "-" are collapsed, one
 * leading/trailing "-" is removed, and the result is cut to 100 characters.
 *
 * @param {string} url - Page URL.
 * @returns {string} The derived id.
 */
function urlToDocumentId(url) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  const dashed = withoutScheme.replace(/[^a-zA-Z0-9]/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  const trimmed = collapsed.replace(/^-|-$/g, "");
  return trimmed.substring(0, 100);
}
416
/**
 * Collapses every run of whitespace (spaces, tabs, newlines) into a single
 * space and trims the result. Paragraph breaks are therefore not preserved.
 *
 * The former follow-up passes for blank lines and tabs were dropped: once
 * all whitespace runs are single spaces, neither pattern can match, so the
 * output is unchanged.
 *
 * @param {string} text - Extracted text.
 * @returns {string} Single-spaced, trimmed text.
 */
function cleanContent(text) {
  return text.replace(/\s+/g, " ").trim();
}
419
/**
 * Measures how much text the page body holds once noise elements are removed.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Extraction options; `removeSelectors` is honoured.
 * @returns {number} Length of the cleaned body text.
 */
function bodyTextLengthHint(html, options = {}) {
  const $ = cheerio3.load(html);
  stripNoiseFromDom($, options);
  const bodyText = $("body").text().trim();
  return cleanContent(bodyText).length;
}
424
/**
 * Turns a page's HTML into an indexable document.
 *
 * Noise elements are stripped, then title, main content, image, description,
 * page type, product data and card metadata are derived and merged into one
 * metadata object. Entries of `options.metadata` are applied last and so
 * override derived values.
 *
 * @param {string} url - Page URL; also used to resolve relative image URLs.
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Extraction options (`titleSelector`,
 *   `contentSelector`, `removeSelectors`, `minExtractedContentLength`,
 *   `defaultType`, `typeFromUrl`, `metadata`).
 * @returns {{id: string, metadata: object, content: string,
 *   indexable: boolean, contentPreview: string}} `indexable` is false when
 *   the content is shorter than `minExtractedContentLength` (default 50).
 */
function extractPageFromHtml(url, html, options = {}) {
  const $ = cheerio3.load(html);
  stripNoiseFromDom($, options);
  const metaContent = (selector) => $(selector).attr("content");

  // Title: an explicit selector wins; otherwise <title>, then the first <h1>.
  const h1Title = $("h1").first().text().trim();
  const docTitle = $("title").text().trim();
  const preferredTitle = options.titleSelector
    ? $(options.titleSelector).first().text().trim()
    : (docTitle || h1Title);
  const title = preferredTitle || h1Title || docTitle;

  const content = extractBestContentText($, options);
  const minChars = options.minExtractedContentLength ?? 50;
  const indexable = Boolean(content && content.length >= minChars);

  // Image: social meta tags first, then product markup, then a hero image.
  const image =
    metaContent('meta[property="og:image"]') ||
    metaContent('meta[name="twitter:image"]') ||
    metaContent('meta[property="product:image"]') ||
    $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") ||
    extractHeroImage($, url) ||
    void 0;
  let imageUrl;
  if (image) {
    try {
      imageUrl = new URL(image, url).href;
    } catch {
      // Keep the raw value when it cannot be resolved against the page URL.
      imageUrl = image;
    }
  }

  const description =
    metaContent('meta[property="og:description"]') ||
    metaContent('meta[name="description"]') ||
    void 0;

  // Type: the first `typeFromUrl` pattern contained in the URL wins.
  const urlRule = options.typeFromUrl
    ? Object.entries(options.typeFromUrl).find(([pattern]) => url.includes(pattern))
    : void 0;
  const type = urlRule ? urlRule[1] : options.defaultType || "page";

  // Product data is read from the untouched HTML, not the stripped DOM.
  const productMeta = extractProductMetadata(html);
  const cardMeta = resolvePageCardMetadata({
    url,
    title,
    headingTitle: h1Title || void 0,
    description,
    imageUrl,
    html,
    type,
    hasProductPrice: productMeta.price != null
  });

  const derived = {
    type: cardMeta.type,
    cardEligible: cardMeta.cardEligible,
    cardPriority: cardMeta.cardPriority
  };
  const addIf = (present, key, value) => {
    if (present) derived[key] = value;
  };
  addIf(title, "title", title);
  addIf(cardMeta.displayTitle, "displayTitle", cardMeta.displayTitle);
  derived.url = url;
  addIf(imageUrl, "imageUrl", imageUrl);
  addIf(cardMeta.displayImageUrl, "displayImageUrl", cardMeta.displayImageUrl);
  addIf(description, "description", description);
  addIf(cardMeta.displayDescription, "displayDescription", cardMeta.displayDescription);
  addIf(productMeta.price != null, "price", productMeta.price);
  addIf(productMeta.currency, "currency", productMeta.currency);
  addIf(productMeta.availability, "availability", productMeta.availability);
  const metadata = { ...derived, ...options.metadata };

  const previewLen = 400;
  const contentPreview = content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;

  return {
    id: urlToDocumentId(url),
    metadata,
    content,
    indexable,
    contentPreview
  };
}
497
/**
 * Removes noise elements from the document in place.
 * Uses `options.removeSelectors` when it is set (an empty list removes
 * nothing) and DEFAULT_REMOVE_SELECTORS otherwise.
 *
 * @param $ - Loaded cheerio document; mutated.
 * @param {object} options - Extraction options.
 */
function stripNoiseFromDom($, options) {
  const selectors = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
  for (const selector of selectors) {
    $(selector).remove();
  }
}
501
/**
 * Picks the longest cleaned text among all elements matched by the content
 * selectors and the full body. On equal length the earlier candidate wins.
 *
 * @param $ - Loaded cheerio document (noise already stripped by the caller).
 * @param {object} options - Extraction options; `contentSelector` overrides
 *   DEFAULT_CONTENT_SELECTOR.
 * @returns {string} The longest candidate text, or "" when nothing has text.
 */
function extractBestContentText($, options) {
  const selectorList = (options.contentSelector || DEFAULT_CONTENT_SELECTOR)
    .split(",")
    .map((part) => part.trim())
    .filter(Boolean);

  let longest = "";
  const consider = (rawText) => {
    const cleaned = cleanContent(rawText.trim());
    if (cleaned.length > longest.length) longest = cleaned;
  };

  for (const selector of selectorList) {
    for (const el of $(selector).toArray()) {
      consider($(el).text());
    }
  }
  // The whole body competes last, so it only wins when strictly longer.
  consider($("body").text());
  return longest;
}
515
/**
 * Fallback image lookup: returns the first image in the main content area
 * (or the body when no content container exists) that does not look like a
 * decorative asset.
 *
 * Skipped: images with a declared width or height under 80, images whose
 * src or alt mentions logo/icon/avatar/favicon/badge/spinner/loading, data
 * URIs and src values ending in ".svg". For `/_next/image` URLs the wrapped
 * `url` query parameter is returned, resolved against the page URL when it
 * is not absolute.
 *
 * @param $ - Loaded cheerio document.
 * @param {string} pageUrl - Page URL used to resolve relative image URLs.
 * @returns {string|undefined} The image source, or undefined when none fits.
 */
function extractHeroImage($, pageUrl) {
  const containers = $('main, article, [role="main"], #content, .content');
  const scope = containers.length > 0 ? containers : $("body");

  for (const el of scope.find("img[src]").toArray()) {
    const img = $(el);
    const src = img.attr("src") || "";
    const alt = (img.attr("alt") || "").toLowerCase();
    const width = parseInt(img.attr("width") || "0", 10);
    const height = parseInt(img.attr("height") || "0", 10);

    const tooSmall = (width > 0 && width < 80) || (height > 0 && height < 80);
    if (tooSmall) continue;
    if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) continue;
    if (src.startsWith("data:") || src.endsWith(".svg")) continue;

    if (src.includes("/_next/image")) {
      try {
        const wrapped = new URL(src, pageUrl).searchParams.get("url");
        if (wrapped) {
          return wrapped.startsWith("http") ? wrapped : new URL(wrapped, pageUrl).href;
        }
      } catch {
        // Unparseable optimizer URL: fall through and use the raw src.
      }
    }
    return src;
  }
  return void 0;
}
544
+
545
+ // src/WebRAGPlugin.ts
7
546
  function bulkOpCurrentUrl(op) {
8
547
  const meta = op.document?.metadata;
9
548
  if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
@@ -22,7 +561,7 @@ function isUrlListingInsert(document) {
22
561
  return false;
23
562
  }
24
563
  }
25
- var WebRAGPlugin = class _WebRAGPlugin {
564
+ var WebRAGPlugin = class {
26
565
  name = "web-rag";
27
566
  type = "rag";
28
567
  priority;
@@ -245,13 +784,21 @@ var WebRAGPlugin = class _WebRAGPlugin {
245
784
  plugin: this.name,
246
785
  contentCount: scoredResults.length,
247
786
  types: [...new Set(scoredResults.map((d) => d.metadata.type))],
248
- topResults: scoredResults.slice(0, 5).map((doc) => ({
787
+ topResults: scoredResults.slice(0, 16).map((doc) => ({
249
788
  id: doc.id,
250
789
  type: doc.metadata.type,
251
790
  title: doc.metadata.title,
252
791
  url: doc.metadata.url,
253
792
  imageUrl: doc.metadata.imageUrl,
254
793
  description: doc.metadata.description,
794
+ cardEligible: doc.metadata.cardEligible,
795
+ cardPriority: doc.metadata.cardPriority,
796
+ displayTitle: doc.metadata.displayTitle,
797
+ displayDescription: doc.metadata.displayDescription,
798
+ displayImageUrl: doc.metadata.displayImageUrl,
799
+ ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
800
+ ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
801
+ ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
255
802
  score: doc.score
256
803
  }))
257
804
  }
@@ -1422,7 +1969,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1422
1969
  return await response.text();
1423
1970
  }
1424
1971
  extractInternalLinks(html, base, stripQueryParams) {
1425
- const $ = cheerio.load(html);
1972
+ const $ = cheerio4.load(html);
1426
1973
  const links = /* @__PURE__ */ new Set();
1427
1974
  $("a[href]").each((_, el) => {
1428
1975
  const href = ($(el).attr("href") || "").trim();
@@ -1588,7 +2135,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1588
2135
  }
1589
2136
  }
1590
2137
  try {
1591
- const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
2138
+ const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
1592
2139
  renderMode,
1593
2140
  renderOptions,
1594
2141
  minContentLength,
@@ -1619,7 +2166,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1619
2166
  status: crawlSt,
1620
2167
  modeUsed: diag?.modeUsed,
1621
2168
  contentLength: doc?.content?.length,
1622
- bodyTextLengthHint,
2169
+ bodyTextLengthHint: bodyTextLengthHint2,
1623
2170
  title: doc?.metadata?.title,
1624
2171
  docId: doc?.id,
1625
2172
  error: diag?.errorMessage
@@ -1731,125 +2278,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
1731
2278
  const html = await response.text();
1732
2279
  return this.extractDocumentFromHtml(url, html, config);
1733
2280
  }
1734
- /**
1735
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
1736
- * would otherwise hit an empty wrapper.
1737
- */
1738
- static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
1739
- stripNoiseFromDom($, config) {
1740
- const removeSelectors = config.removeSelectors || [
1741
- "script",
1742
- "style",
1743
- "nav",
1744
- "header",
1745
- "footer",
1746
- ".sidebar",
1747
- ".navigation",
1748
- ".menu",
1749
- ".comments",
1750
- '[role="navigation"]',
1751
- '[role="banner"]'
1752
- ];
1753
- removeSelectors.forEach((selector) => $(selector).remove());
1754
- }
1755
- /** Longest cleaned text among selector matches and full body (after noise strip). */
1756
- extractBestContentText($, config) {
1757
- const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
1758
- const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
1759
- let best = "";
1760
- for (const sel of selectors) {
1761
- $(sel).each((_, el) => {
1762
- const t = this.cleanContent($(el).text().trim());
1763
- if (t.length > best.length) best = t;
1764
- });
1765
- }
1766
- const bodyText = this.cleanContent($("body").text().trim());
1767
- if (bodyText.length > best.length) best = bodyText;
1768
- return best;
1769
- }
1770
2281
  bodyTextLengthHint(html, config) {
1771
- const $ = cheerio.load(html);
1772
- this.stripNoiseFromDom($, config);
1773
- return this.cleanContent($("body").text().trim()).length;
2282
+ return bodyTextLengthHint(html, config);
1774
2283
  }
1775
2284
  extractDocumentFromHtml(url, html, config) {
1776
- const $ = cheerio.load(html);
1777
- this.stripNoiseFromDom($, config);
1778
- const titleSelector = config.titleSelector || "h1, title";
1779
- let title = $(titleSelector).first().text().trim();
1780
- if (!title) {
1781
- title = $("title").text().trim();
1782
- }
1783
- const content = this.extractBestContentText($, config);
1784
- const minChars = config.minExtractedContentLength ?? 50;
1785
- if (!content || content.length < minChars) return null;
1786
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1787
- this.extractHeroImage($, url) || void 0;
1788
- let imageUrl;
1789
- if (image) {
1790
- try {
1791
- imageUrl = new URL(image, url).href;
1792
- } catch {
1793
- imageUrl = image;
1794
- }
1795
- }
1796
- const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
1797
- let type = config.defaultType || "page";
1798
- if (config.typeFromUrl) {
1799
- for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
1800
- if (url.includes(pattern)) {
1801
- type = typeName;
1802
- break;
1803
- }
1804
- }
1805
- }
1806
- const id = this.urlToId(url);
2285
+ const extracted = extractPageFromHtml(url, html, config);
2286
+ if (!extracted.indexable) return null;
1807
2287
  return {
1808
- id,
1809
- content,
1810
- metadata: {
1811
- type,
1812
- title,
1813
- url,
1814
- ...imageUrl ? { imageUrl } : {},
1815
- ...description ? { description } : {},
1816
- ...config.metadata
1817
- }
2288
+ id: extracted.id,
2289
+ content: extracted.content,
2290
+ metadata: extracted.metadata
1818
2291
  };
1819
2292
  }
1820
- /**
1821
- * Fallback image extraction: finds the first meaningful image in the content area.
1822
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
1823
- */
1824
- extractHeroImage($, pageUrl) {
1825
- const containers = $('main, article, [role="main"], #content, .content');
1826
- const scope = containers.length > 0 ? containers : $("body");
1827
- let best;
1828
- scope.find("img[src]").each((_, el) => {
1829
- if (best) return false;
1830
- const src = $(el).attr("src") || "";
1831
- const alt = ($(el).attr("alt") || "").toLowerCase();
1832
- const width = parseInt($(el).attr("width") || "0", 10);
1833
- const height = parseInt($(el).attr("height") || "0", 10);
1834
- if (width > 0 && width < 80 || height > 0 && height < 80) return;
1835
- if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1836
- if (src.startsWith("data:") || src.endsWith(".svg")) return;
1837
- if (src.includes("/_next/image")) {
1838
- try {
1839
- const nextUrl = new URL(src, pageUrl);
1840
- const realUrl = nextUrl.searchParams.get("url");
1841
- if (realUrl) {
1842
- best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1843
- return false;
1844
- }
1845
- } catch {
1846
- }
1847
- }
1848
- best = src;
1849
- return false;
1850
- });
1851
- return best;
1852
- }
1853
2293
  looksLikeDynamicShell(html) {
1854
2294
  const lower = html.toLowerCase();
1855
2295
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
@@ -1867,7 +2307,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1867
2307
  const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
1868
2308
  return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
1869
2309
  }
1870
- diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
2310
+ diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
1871
2311
  if (blockedSuspected) {
1872
2312
  return {
1873
2313
  doc: null,
@@ -1883,12 +2323,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1883
2323
  return {
1884
2324
  doc,
1885
2325
  diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
1886
- bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
2326
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
1887
2327
  };
1888
2328
  }
1889
2329
  async crawlPageSmart(url, config, timeout, ctx) {
1890
2330
  if (ctx.renderMode === true) {
1891
- const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
2331
+ const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
1892
2332
  url,
1893
2333
  config,
1894
2334
  timeout,
@@ -1897,7 +2337,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1897
2337
  );
1898
2338
  return this.diagFromRenderedAttempt(
1899
2339
  doc,
1900
- bodyTextLengthHint,
2340
+ bodyTextLengthHint2,
1901
2341
  renderFailure,
1902
2342
  blockedSuspected,
1903
2343
  "render_ok",
@@ -2014,7 +2454,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2014
2454
  }
2015
2455
  }
2016
2456
  const html = await page.content();
2017
- const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
2457
+ const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
2018
2458
  const doc = this.extractDocumentFromHtml(url, html, config);
2019
2459
  if (config.debug?.saveDir && config.debug?.enabled) {
2020
2460
  try {
@@ -2029,7 +2469,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2029
2469
  dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
2030
2470
  }
2031
2471
  }
2032
- return { doc, bodyTextLengthHint };
2472
+ return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
2033
2473
  } catch (e) {
2034
2474
  const msg = String(e?.message || e || "render_failed");
2035
2475
  const lower = msg.toLowerCase();
@@ -2121,14 +2561,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
2121
2561
  /**
2122
2562
  * Clean extracted text content
2123
2563
  */
2124
- cleanContent(text) {
2125
- return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
2126
- }
2127
- /**
2128
- * Convert URL to a stable document ID
2129
- */
2130
2564
  urlToId(url) {
2131
- return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
2565
+ return urlToDocumentId(url);
2132
2566
  }
2133
2567
  /**
2134
2568
  * Delay helper
@@ -2397,5 +2831,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
2397
2831
  }
2398
2832
  };
2399
2833
  export {
2400
- WebRAGPlugin
2834
+ WebRAGPlugin,
2835
+ bodyTextLengthHint,
2836
+ extractPageFromHtml,
2837
+ extractProductMetadata,
2838
+ hardExcludePage,
2839
+ inferTypeFromUrl,
2840
+ normalizeAvailability,
2841
+ normalizeCurrency,
2842
+ normalizeDisplayTitle,
2843
+ parsePrice,
2844
+ resolvePageCardMetadata,
2845
+ urlToDocumentId
2401
2846
  };