@snap-agent/rag-web 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -805,4 +805,31 @@ declare function parsePrice(value: unknown): number | undefined;
805
805
  declare function normalizeCurrency(value: unknown): string | undefined;
806
806
  declare function normalizeAvailability(value: unknown): string | undefined;
807
807
 
808
- export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
808
+ /** Abstract page roles vertical-agnostic. */
809
+ type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
810
+ interface PageCardMetadataInput {
811
+ url: string;
812
+ title?: string;
813
+ /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
814
+ headingTitle?: string;
815
+ description?: string;
816
+ imageUrl?: string;
817
+ html?: string;
818
+ /** Type already resolved from typeFromUrl / defaultType. */
819
+ type?: string;
820
+ hasProductPrice?: boolean;
821
+ }
822
+ interface PageCardMetadataResult {
823
+ type: PageCardType | string;
824
+ cardEligible: boolean;
825
+ cardPriority: number;
826
+ displayTitle?: string;
827
+ displayDescription?: string;
828
+ displayImageUrl?: string;
829
+ }
830
+ declare function normalizeDisplayTitle(title?: string): string | undefined;
831
+ declare function hardExcludePage(url: string, title?: string): boolean;
832
+ declare function inferTypeFromUrl(url: string): PageCardType | undefined;
833
+ declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
834
+
835
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
package/dist/index.d.ts CHANGED
@@ -805,4 +805,31 @@ declare function parsePrice(value: unknown): number | undefined;
805
805
  declare function normalizeCurrency(value: unknown): string | undefined;
806
806
  declare function normalizeAvailability(value: unknown): string | undefined;
807
807
 
808
- export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
808
+ /** Abstract page roles vertical-agnostic. */
809
+ type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
810
+ interface PageCardMetadataInput {
811
+ url: string;
812
+ title?: string;
813
+ /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
814
+ headingTitle?: string;
815
+ description?: string;
816
+ imageUrl?: string;
817
+ html?: string;
818
+ /** Type already resolved from typeFromUrl / defaultType. */
819
+ type?: string;
820
+ hasProductPrice?: boolean;
821
+ }
822
+ interface PageCardMetadataResult {
823
+ type: PageCardType | string;
824
+ cardEligible: boolean;
825
+ cardPriority: number;
826
+ displayTitle?: string;
827
+ displayDescription?: string;
828
+ displayImageUrl?: string;
829
+ }
830
+ declare function normalizeDisplayTitle(title?: string): string | undefined;
831
+ declare function hardExcludePage(url: string, title?: string): boolean;
832
+ declare function inferTypeFromUrl(url: string): PageCardType | undefined;
833
+ declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
834
+
835
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
package/dist/index.js CHANGED
@@ -34,9 +34,13 @@ __export(index_exports, {
34
34
  bodyTextLengthHint: () => bodyTextLengthHint,
35
35
  extractPageFromHtml: () => extractPageFromHtml,
36
36
  extractProductMetadata: () => extractProductMetadata,
37
+ hardExcludePage: () => hardExcludePage,
38
+ inferTypeFromUrl: () => inferTypeFromUrl,
37
39
  normalizeAvailability: () => normalizeAvailability,
38
40
  normalizeCurrency: () => normalizeCurrency,
41
+ normalizeDisplayTitle: () => normalizeDisplayTitle,
39
42
  parsePrice: () => parsePrice,
43
+ resolvePageCardMetadata: () => resolvePageCardMetadata,
40
44
  urlToDocumentId: () => urlToDocumentId
41
45
  });
42
46
  module.exports = __toCommonJS(index_exports);
@@ -44,12 +48,12 @@ module.exports = __toCommonJS(index_exports);
44
48
  // src/WebRAGPlugin.ts
45
49
  var import_mongodb = require("mongodb");
46
50
  var import_openai = __toESM(require("openai"));
47
- var cheerio3 = __toESM(require("cheerio"));
51
+ var cheerio4 = __toESM(require("cheerio"));
48
52
  var fs = __toESM(require("fs"));
49
53
  var path = __toESM(require("path"));
50
54
 
51
55
  // src/htmlPageExtract.ts
52
- var cheerio2 = __toESM(require("cheerio"));
56
+ var cheerio3 = __toESM(require("cheerio"));
53
57
 
54
58
  // src/productMetadata.ts
55
59
  var cheerio = __toESM(require("cheerio"));
@@ -216,6 +220,228 @@ function normalizeAvailability(value) {
216
220
  return s.replace(/\s+/g, "") || void 0;
217
221
  }
218
222
 
223
+ // src/pageCardMetadata.ts
224
+ var cheerio2 = __toESM(require("cheerio"));
225
+ var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
226
+ var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
227
+ var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
228
+ var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
229
+ var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
230
+ var ENTITY_DETAIL_PATH_RES = [
231
+ /\/projects\/[^/]+/i,
232
+ /\/project\/[^/]+/i,
233
+ /\/perspectives\/[^/]+/i,
234
+ /\/perspective\/[^/]+/i,
235
+ /\/portfolio\/[^/]+/i,
236
+ /\/case-stud(?:y|ies)\/[^/]+/i,
237
+ /\/insights?\/[^/]+/i,
238
+ /\/people\/[^/]+/i,
239
+ /\/person\/[^/]+/i,
240
+ /\/team-members?\/[^/]+/i,
241
+ /\/members?\/[^/]+/i,
242
+ /\/staff\/[^/]+/i,
243
+ /\/experts?\/[^/]+/i,
244
+ /\/authors?\/[^/]+/i,
245
+ /\/leadership\/[^/]+/i,
246
+ /\/biograph(?:y|ies)\/[^/]+/i
247
+ ];
248
+ var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
249
+ var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
250
+ var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
251
+ var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
252
+ var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
253
+ var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
254
+ var CARD_PRIORITY = {
255
+ detail: 10,
256
+ listing: 6,
257
+ amenity: 5,
258
+ promotion: 2,
259
+ contact: 1,
260
+ content: 1,
261
+ blog: 0,
262
+ system: 0,
263
+ page: 3
264
+ };
265
+ var CARD_ELIGIBLE_DEFAULT = {
266
+ detail: true,
267
+ listing: true,
268
+ amenity: true,
269
+ promotion: false,
270
+ contact: false,
271
+ content: false,
272
+ blog: false,
273
+ system: false,
274
+ page: false
275
+ };
276
+ var SCHEMA_TYPE_MAP = {
277
+ product: "detail",
278
+ service: "amenity",
279
+ hotelroom: "detail",
280
+ room: "detail",
281
+ apartment: "detail",
282
+ lodgingroom: "detail",
283
+ course: "detail",
284
+ event: "detail",
285
+ offer: "promotion",
286
+ person: "detail",
287
+ employee: "detail",
288
+ profilepage: "detail",
289
+ article: "detail",
290
+ newsarticle: "detail",
291
+ blogposting: "detail",
292
+ creativework: "detail"
293
+ };
294
+ function normalizeDisplayTitle(title) {
295
+ if (!title?.trim()) return title;
296
+ let t = title.trim();
297
+ for (let i = 0; i < 2; i++) {
298
+ const dash = t.match(EN_DASH_SUFFIX_RE);
299
+ if (dash && dash.index !== void 0 && dash.index >= 4) {
300
+ t = t.slice(0, dash.index).trim();
301
+ continue;
302
+ }
303
+ const pipe = t.match(PIPE_SUFFIX_RE);
304
+ if (pipe && pipe.index !== void 0 && pipe.index >= 8) {
305
+ t = t.slice(0, pipe.index).trim();
306
+ continue;
307
+ }
308
+ break;
309
+ }
310
+ return t || title.trim();
311
+ }
312
+ function hardExcludePage(url, title) {
313
+ const path2 = url.toLowerCase();
314
+ if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
315
+ if (BLOG_URL_RE.test(path2)) return true;
316
+ if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;
317
+ try {
318
+ const u = new URL(url);
319
+ if (u.pathname === "/" || u.pathname === "") return true;
320
+ } catch {
321
+ }
322
+ return false;
323
+ }
324
+ function inferTypeFromUrl(url) {
325
+ const path2 = url.toLowerCase();
326
+ if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
327
+ if (CONTACT_URL_RE.test(path2)) return "contact";
328
+ if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
329
+ if (AMENITY_URL_RE.test(path2)) return "amenity";
330
+ if (DETAIL_URL_RE.test(path2)) return "detail";
331
+ if (LISTING_URL_RE.test(path2)) return "listing";
332
+ if (BLOG_URL_RE.test(path2)) return "blog";
333
+ return void 0;
334
+ }
335
+ function collectJsonLdNodes2(data) {
336
+ const nodes = [];
337
+ const visit = (value) => {
338
+ if (value == null) return;
339
+ if (Array.isArray(value)) {
340
+ value.forEach(visit);
341
+ return;
342
+ }
343
+ if (typeof value !== "object") return;
344
+ const obj = value;
345
+ nodes.push(obj);
346
+ if (obj["@graph"]) visit(obj["@graph"]);
347
+ };
348
+ visit(data);
349
+ return nodes;
350
+ }
351
+ function schemaTypeName(node) {
352
+ const type = node["@type"];
353
+ const types = Array.isArray(type) ? type : type != null ? [type] : [];
354
+ const raw = types[0];
355
+ if (raw == null) return "";
356
+ const s = String(raw).toLowerCase();
357
+ const slash = s.lastIndexOf("/");
358
+ return slash >= 0 ? s.slice(slash + 1) : s;
359
+ }
360
+ function inferTypeFromSchema(html) {
361
+ const $ = cheerio2.load(html);
362
+ for (const el of $('script[type="application/ld+json"]').toArray()) {
363
+ const raw = $(el).html()?.trim();
364
+ if (!raw) continue;
365
+ try {
366
+ const parsed = JSON.parse(raw);
367
+ for (const node of collectJsonLdNodes2(parsed)) {
368
+ const name = schemaTypeName(node);
369
+ if (SCHEMA_TYPE_MAP[name]) return SCHEMA_TYPE_MAP[name];
370
+ if (name === "product" || node.offers != null) return "detail";
371
+ }
372
+ } catch {
373
+ }
374
+ }
375
+ const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
376
+ if (ogType === "product") return "detail";
377
+ return void 0;
378
+ }
379
+ function normalizePageType(raw) {
380
+ if (!raw) return "page";
381
+ const lower = raw.toLowerCase();
382
+ const known = [
383
+ "detail",
384
+ "listing",
385
+ "amenity",
386
+ "promotion",
387
+ "contact",
388
+ "content",
389
+ "blog",
390
+ "system",
391
+ "page"
392
+ ];
393
+ if (known.includes(lower)) return lower;
394
+ if (lower === "room" || lower === "product") return "detail";
395
+ if (lower === "offer" || lower === "sale") return "promotion";
396
+ return raw;
397
+ }
398
+ function resolveDisplayTitle(input) {
399
+ const heading = input.headingTitle?.trim();
400
+ if (heading) return normalizeDisplayTitle(heading);
401
+ return normalizeDisplayTitle(input.title);
402
+ }
403
+ function resolvePageCardMetadata(input) {
404
+ const title = input.title?.trim();
405
+ const url = input.url;
406
+ const displayTitle = resolveDisplayTitle(input);
407
+ if (hardExcludePage(url, title)) {
408
+ return {
409
+ type: "system",
410
+ cardEligible: false,
411
+ cardPriority: 0,
412
+ displayTitle,
413
+ displayDescription: input.description,
414
+ displayImageUrl: input.imageUrl
415
+ };
416
+ }
417
+ let type = normalizePageType(input.type);
418
+ if (type === "page" && input.html) {
419
+ const fromSchema = inferTypeFromSchema(input.html);
420
+ if (fromSchema) type = fromSchema;
421
+ }
422
+ if (type === "page") {
423
+ const fromUrl = inferTypeFromUrl(url);
424
+ if (fromUrl) type = fromUrl;
425
+ }
426
+ if (input.hasProductPrice && type === "page") {
427
+ type = "detail";
428
+ }
429
+ const typeKey = String(type).toLowerCase();
430
+ let cardEligible = CARD_ELIGIBLE_DEFAULT[typeKey] ?? false;
431
+ let cardPriority = CARD_PRIORITY[typeKey] ?? 3;
432
+ if (cardEligible && PROMOTION_URL_RE.test(url.toLowerCase())) {
433
+ cardEligible = false;
434
+ }
435
+ return {
436
+ type,
437
+ cardEligible,
438
+ cardPriority,
439
+ displayTitle,
440
+ displayDescription: input.description?.trim() || void 0,
441
+ displayImageUrl: input.imageUrl
442
+ };
443
+ }
444
+
219
445
  // src/htmlPageExtract.ts
220
446
  var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
221
447
  var DEFAULT_REMOVE_SELECTORS = [
@@ -238,17 +464,23 @@ function cleanContent(text) {
238
464
  return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
239
465
  }
240
466
  function bodyTextLengthHint(html, options = {}) {
241
- const $ = cheerio2.load(html);
467
+ const $ = cheerio3.load(html);
242
468
  stripNoiseFromDom($, options);
243
469
  return cleanContent($("body").text().trim()).length;
244
470
  }
245
471
  function extractPageFromHtml(url, html, options = {}) {
246
- const $ = cheerio2.load(html);
472
+ const $ = cheerio3.load(html);
247
473
  stripNoiseFromDom($, options);
248
- const titleSelector = options.titleSelector || "h1, title";
249
- let title = $(titleSelector).first().text().trim();
474
+ const h1Title = $("h1").first().text().trim();
475
+ const docTitle = $("title").text().trim();
476
+ let title = "";
477
+ if (options.titleSelector) {
478
+ title = $(options.titleSelector).first().text().trim();
479
+ } else {
480
+ title = docTitle || h1Title;
481
+ }
250
482
  if (!title) {
251
- title = $("title").text().trim();
483
+ title = h1Title || docTitle;
252
484
  }
253
485
  const content = extractBestContentText($, options);
254
486
  const minChars = options.minExtractedContentLength ?? 50;
@@ -273,12 +505,27 @@ function extractPageFromHtml(url, html, options = {}) {
273
505
  }
274
506
  }
275
507
  const productMeta = extractProductMetadata(html);
276
- const metadata = {
508
+ const cardMeta = resolvePageCardMetadata({
509
+ url,
510
+ title,
511
+ headingTitle: h1Title || void 0,
512
+ description,
513
+ imageUrl,
514
+ html,
277
515
  type,
516
+ hasProductPrice: productMeta.price != null
517
+ });
518
+ const metadata = {
519
+ type: cardMeta.type,
520
+ cardEligible: cardMeta.cardEligible,
521
+ cardPriority: cardMeta.cardPriority,
278
522
  ...title ? { title } : {},
523
+ ...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
279
524
  url,
280
525
  ...imageUrl ? { imageUrl } : {},
526
+ ...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
281
527
  ...description ? { description } : {},
528
+ ...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
282
529
  ...productMeta.price != null ? { price: productMeta.price } : {},
283
530
  ...productMeta.currency ? { currency: productMeta.currency } : {},
284
531
  ...productMeta.availability ? { availability: productMeta.availability } : {},
@@ -584,13 +831,18 @@ var WebRAGPlugin = class {
584
831
  plugin: this.name,
585
832
  contentCount: scoredResults.length,
586
833
  types: [...new Set(scoredResults.map((d) => d.metadata.type))],
587
- topResults: scoredResults.slice(0, 5).map((doc) => ({
834
+ topResults: scoredResults.slice(0, 16).map((doc) => ({
588
835
  id: doc.id,
589
836
  type: doc.metadata.type,
590
837
  title: doc.metadata.title,
591
838
  url: doc.metadata.url,
592
839
  imageUrl: doc.metadata.imageUrl,
593
840
  description: doc.metadata.description,
841
+ cardEligible: doc.metadata.cardEligible,
842
+ cardPriority: doc.metadata.cardPriority,
843
+ displayTitle: doc.metadata.displayTitle,
844
+ displayDescription: doc.metadata.displayDescription,
845
+ displayImageUrl: doc.metadata.displayImageUrl,
594
846
  ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
595
847
  ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
596
848
  ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
@@ -1764,7 +2016,7 @@ var WebRAGPlugin = class {
1764
2016
  return await response.text();
1765
2017
  }
1766
2018
  extractInternalLinks(html, base, stripQueryParams) {
1767
- const $ = cheerio3.load(html);
2019
+ const $ = cheerio4.load(html);
1768
2020
  const links = /* @__PURE__ */ new Set();
1769
2021
  $("a[href]").each((_, el) => {
1770
2022
  const href = ($(el).attr("href") || "").trim();
@@ -2631,8 +2883,12 @@ var WebRAGPlugin = class {
2631
2883
  bodyTextLengthHint,
2632
2884
  extractPageFromHtml,
2633
2885
  extractProductMetadata,
2886
+ hardExcludePage,
2887
+ inferTypeFromUrl,
2634
2888
  normalizeAvailability,
2635
2889
  normalizeCurrency,
2890
+ normalizeDisplayTitle,
2636
2891
  parsePrice,
2892
+ resolvePageCardMetadata,
2637
2893
  urlToDocumentId
2638
2894
  });
package/dist/index.mjs CHANGED
@@ -1,12 +1,12 @@
1
1
  // src/WebRAGPlugin.ts
2
2
  import { MongoClient } from "mongodb";
3
3
  import OpenAI from "openai";
4
- import * as cheerio3 from "cheerio";
4
+ import * as cheerio4 from "cheerio";
5
5
  import * as fs from "fs";
6
6
  import * as path from "path";
7
7
 
8
8
  // src/htmlPageExtract.ts
9
- import * as cheerio2 from "cheerio";
9
+ import * as cheerio3 from "cheerio";
10
10
 
11
11
  // src/productMetadata.ts
12
12
  import * as cheerio from "cheerio";
@@ -173,6 +173,228 @@ function normalizeAvailability(value) {
173
173
  return s.replace(/\s+/g, "") || void 0;
174
174
  }
175
175
 
176
+ // src/pageCardMetadata.ts
177
+ import * as cheerio2 from "cheerio";
178
+ var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
179
+ var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
180
+ var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
181
+ var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
182
+ var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
183
+ var ENTITY_DETAIL_PATH_RES = [
184
+ /\/projects\/[^/]+/i,
185
+ /\/project\/[^/]+/i,
186
+ /\/perspectives\/[^/]+/i,
187
+ /\/perspective\/[^/]+/i,
188
+ /\/portfolio\/[^/]+/i,
189
+ /\/case-stud(?:y|ies)\/[^/]+/i,
190
+ /\/insights?\/[^/]+/i,
191
+ /\/people\/[^/]+/i,
192
+ /\/person\/[^/]+/i,
193
+ /\/team-members?\/[^/]+/i,
194
+ /\/members?\/[^/]+/i,
195
+ /\/staff\/[^/]+/i,
196
+ /\/experts?\/[^/]+/i,
197
+ /\/authors?\/[^/]+/i,
198
+ /\/leadership\/[^/]+/i,
199
+ /\/biograph(?:y|ies)\/[^/]+/i
200
+ ];
201
+ var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
202
+ var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
203
+ var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
204
+ var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
205
+ var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
206
+ var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
207
+ var CARD_PRIORITY = {
208
+ detail: 10,
209
+ listing: 6,
210
+ amenity: 5,
211
+ promotion: 2,
212
+ contact: 1,
213
+ content: 1,
214
+ blog: 0,
215
+ system: 0,
216
+ page: 3
217
+ };
218
+ var CARD_ELIGIBLE_DEFAULT = {
219
+ detail: true,
220
+ listing: true,
221
+ amenity: true,
222
+ promotion: false,
223
+ contact: false,
224
+ content: false,
225
+ blog: false,
226
+ system: false,
227
+ page: false
228
+ };
229
+ var SCHEMA_TYPE_MAP = {
230
+ product: "detail",
231
+ service: "amenity",
232
+ hotelroom: "detail",
233
+ room: "detail",
234
+ apartment: "detail",
235
+ lodgingroom: "detail",
236
+ course: "detail",
237
+ event: "detail",
238
+ offer: "promotion",
239
+ person: "detail",
240
+ employee: "detail",
241
+ profilepage: "detail",
242
+ article: "detail",
243
+ newsarticle: "detail",
244
+ blogposting: "detail",
245
+ creativework: "detail"
246
+ };
247
+ function normalizeDisplayTitle(title) {
248
+ if (!title?.trim()) return title;
249
+ let t = title.trim();
250
+ for (let i = 0; i < 2; i++) {
251
+ const dash = t.match(EN_DASH_SUFFIX_RE);
252
+ if (dash && dash.index !== void 0 && dash.index >= 4) {
253
+ t = t.slice(0, dash.index).trim();
254
+ continue;
255
+ }
256
+ const pipe = t.match(PIPE_SUFFIX_RE);
257
+ if (pipe && pipe.index !== void 0 && pipe.index >= 8) {
258
+ t = t.slice(0, pipe.index).trim();
259
+ continue;
260
+ }
261
+ break;
262
+ }
263
+ return t || title.trim();
264
+ }
265
+ function hardExcludePage(url, title) {
266
+ const path2 = url.toLowerCase();
267
+ if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
268
+ if (BLOG_URL_RE.test(path2)) return true;
269
+ if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;
270
+ try {
271
+ const u = new URL(url);
272
+ if (u.pathname === "/" || u.pathname === "") return true;
273
+ } catch {
274
+ }
275
+ return false;
276
+ }
277
+ function inferTypeFromUrl(url) {
278
+ const path2 = url.toLowerCase();
279
+ if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
280
+ if (CONTACT_URL_RE.test(path2)) return "contact";
281
+ if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
282
+ if (AMENITY_URL_RE.test(path2)) return "amenity";
283
+ if (DETAIL_URL_RE.test(path2)) return "detail";
284
+ if (LISTING_URL_RE.test(path2)) return "listing";
285
+ if (BLOG_URL_RE.test(path2)) return "blog";
286
+ return void 0;
287
+ }
288
+ function collectJsonLdNodes2(data) {
289
+ const nodes = [];
290
+ const visit = (value) => {
291
+ if (value == null) return;
292
+ if (Array.isArray(value)) {
293
+ value.forEach(visit);
294
+ return;
295
+ }
296
+ if (typeof value !== "object") return;
297
+ const obj = value;
298
+ nodes.push(obj);
299
+ if (obj["@graph"]) visit(obj["@graph"]);
300
+ };
301
+ visit(data);
302
+ return nodes;
303
+ }
304
+ function schemaTypeName(node) {
305
+ const type = node["@type"];
306
+ const types = Array.isArray(type) ? type : type != null ? [type] : [];
307
+ const raw = types[0];
308
+ if (raw == null) return "";
309
+ const s = String(raw).toLowerCase();
310
+ const slash = s.lastIndexOf("/");
311
+ return slash >= 0 ? s.slice(slash + 1) : s;
312
+ }
313
+ function inferTypeFromSchema(html) {
314
+ const $ = cheerio2.load(html);
315
+ for (const el of $('script[type="application/ld+json"]').toArray()) {
316
+ const raw = $(el).html()?.trim();
317
+ if (!raw) continue;
318
+ try {
319
+ const parsed = JSON.parse(raw);
320
+ for (const node of collectJsonLdNodes2(parsed)) {
321
+ const name = schemaTypeName(node);
322
+ if (SCHEMA_TYPE_MAP[name]) return SCHEMA_TYPE_MAP[name];
323
+ if (name === "product" || node.offers != null) return "detail";
324
+ }
325
+ } catch {
326
+ }
327
+ }
328
+ const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
329
+ if (ogType === "product") return "detail";
330
+ return void 0;
331
+ }
332
+ function normalizePageType(raw) {
333
+ if (!raw) return "page";
334
+ const lower = raw.toLowerCase();
335
+ const known = [
336
+ "detail",
337
+ "listing",
338
+ "amenity",
339
+ "promotion",
340
+ "contact",
341
+ "content",
342
+ "blog",
343
+ "system",
344
+ "page"
345
+ ];
346
+ if (known.includes(lower)) return lower;
347
+ if (lower === "room" || lower === "product") return "detail";
348
+ if (lower === "offer" || lower === "sale") return "promotion";
349
+ return raw;
350
+ }
351
+ function resolveDisplayTitle(input) {
352
+ const heading = input.headingTitle?.trim();
353
+ if (heading) return normalizeDisplayTitle(heading);
354
+ return normalizeDisplayTitle(input.title);
355
+ }
356
+ function resolvePageCardMetadata(input) {
357
+ const title = input.title?.trim();
358
+ const url = input.url;
359
+ const displayTitle = resolveDisplayTitle(input);
360
+ if (hardExcludePage(url, title)) {
361
+ return {
362
+ type: "system",
363
+ cardEligible: false,
364
+ cardPriority: 0,
365
+ displayTitle,
366
+ displayDescription: input.description,
367
+ displayImageUrl: input.imageUrl
368
+ };
369
+ }
370
+ let type = normalizePageType(input.type);
371
+ if (type === "page" && input.html) {
372
+ const fromSchema = inferTypeFromSchema(input.html);
373
+ if (fromSchema) type = fromSchema;
374
+ }
375
+ if (type === "page") {
376
+ const fromUrl = inferTypeFromUrl(url);
377
+ if (fromUrl) type = fromUrl;
378
+ }
379
+ if (input.hasProductPrice && type === "page") {
380
+ type = "detail";
381
+ }
382
+ const typeKey = String(type).toLowerCase();
383
+ let cardEligible = CARD_ELIGIBLE_DEFAULT[typeKey] ?? false;
384
+ let cardPriority = CARD_PRIORITY[typeKey] ?? 3;
385
+ if (cardEligible && PROMOTION_URL_RE.test(url.toLowerCase())) {
386
+ cardEligible = false;
387
+ }
388
+ return {
389
+ type,
390
+ cardEligible,
391
+ cardPriority,
392
+ displayTitle,
393
+ displayDescription: input.description?.trim() || void 0,
394
+ displayImageUrl: input.imageUrl
395
+ };
396
+ }
397
+
176
398
  // src/htmlPageExtract.ts
177
399
  var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
178
400
  var DEFAULT_REMOVE_SELECTORS = [
@@ -195,17 +417,23 @@ function cleanContent(text) {
195
417
  return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
196
418
  }
197
419
  function bodyTextLengthHint(html, options = {}) {
198
- const $ = cheerio2.load(html);
420
+ const $ = cheerio3.load(html);
199
421
  stripNoiseFromDom($, options);
200
422
  return cleanContent($("body").text().trim()).length;
201
423
  }
202
424
  function extractPageFromHtml(url, html, options = {}) {
203
- const $ = cheerio2.load(html);
425
+ const $ = cheerio3.load(html);
204
426
  stripNoiseFromDom($, options);
205
- const titleSelector = options.titleSelector || "h1, title";
206
- let title = $(titleSelector).first().text().trim();
427
+ const h1Title = $("h1").first().text().trim();
428
+ const docTitle = $("title").text().trim();
429
+ let title = "";
430
+ if (options.titleSelector) {
431
+ title = $(options.titleSelector).first().text().trim();
432
+ } else {
433
+ title = docTitle || h1Title;
434
+ }
207
435
  if (!title) {
208
- title = $("title").text().trim();
436
+ title = h1Title || docTitle;
209
437
  }
210
438
  const content = extractBestContentText($, options);
211
439
  const minChars = options.minExtractedContentLength ?? 50;
@@ -230,12 +458,27 @@ function extractPageFromHtml(url, html, options = {}) {
230
458
  }
231
459
  }
232
460
  const productMeta = extractProductMetadata(html);
233
- const metadata = {
461
+ const cardMeta = resolvePageCardMetadata({
462
+ url,
463
+ title,
464
+ headingTitle: h1Title || void 0,
465
+ description,
466
+ imageUrl,
467
+ html,
234
468
  type,
469
+ hasProductPrice: productMeta.price != null
470
+ });
471
+ const metadata = {
472
+ type: cardMeta.type,
473
+ cardEligible: cardMeta.cardEligible,
474
+ cardPriority: cardMeta.cardPriority,
235
475
  ...title ? { title } : {},
476
+ ...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
236
477
  url,
237
478
  ...imageUrl ? { imageUrl } : {},
479
+ ...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
238
480
  ...description ? { description } : {},
481
+ ...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
239
482
  ...productMeta.price != null ? { price: productMeta.price } : {},
240
483
  ...productMeta.currency ? { currency: productMeta.currency } : {},
241
484
  ...productMeta.availability ? { availability: productMeta.availability } : {},
@@ -541,13 +784,18 @@ var WebRAGPlugin = class {
541
784
  plugin: this.name,
542
785
  contentCount: scoredResults.length,
543
786
  types: [...new Set(scoredResults.map((d) => d.metadata.type))],
544
- topResults: scoredResults.slice(0, 5).map((doc) => ({
787
+ topResults: scoredResults.slice(0, 16).map((doc) => ({
545
788
  id: doc.id,
546
789
  type: doc.metadata.type,
547
790
  title: doc.metadata.title,
548
791
  url: doc.metadata.url,
549
792
  imageUrl: doc.metadata.imageUrl,
550
793
  description: doc.metadata.description,
794
+ cardEligible: doc.metadata.cardEligible,
795
+ cardPriority: doc.metadata.cardPriority,
796
+ displayTitle: doc.metadata.displayTitle,
797
+ displayDescription: doc.metadata.displayDescription,
798
+ displayImageUrl: doc.metadata.displayImageUrl,
551
799
  ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
552
800
  ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
553
801
  ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
@@ -1721,7 +1969,7 @@ var WebRAGPlugin = class {
1721
1969
  return await response.text();
1722
1970
  }
1723
1971
  extractInternalLinks(html, base, stripQueryParams) {
1724
- const $ = cheerio3.load(html);
1972
+ const $ = cheerio4.load(html);
1725
1973
  const links = /* @__PURE__ */ new Set();
1726
1974
  $("a[href]").each((_, el) => {
1727
1975
  const href = ($(el).attr("href") || "").trim();
@@ -2587,8 +2835,12 @@ export {
2587
2835
  bodyTextLengthHint,
2588
2836
  extractPageFromHtml,
2589
2837
  extractProductMetadata,
2838
+ hardExcludePage,
2839
+ inferTypeFromUrl,
2590
2840
  normalizeAvailability,
2591
2841
  normalizeCurrency,
2842
+ normalizeDisplayTitle,
2592
2843
  parsePrice,
2844
+ resolvePageCardMetadata,
2593
2845
  urlToDocumentId
2594
2846
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@snap-agent/rag-web",
3
- "version": "0.1.6",
3
+ "version": "0.1.7",
4
4
  "description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",