@snap-agent/rag-web 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -114,6 +114,12 @@ interface CrawlPageStatusEntry {
114
114
  httpStatus?: number;
115
115
  error?: string;
116
116
  skippedReason?: string;
117
+ /**
118
+ * Same-origin internal links found on this page, populated only when `extractLinks` is set on
119
+ * the crawl config. Enables resumable recursive (BFS) crawling: the caller feeds these back into
120
+ * its own frontier instead of the SDK doing a separate link-discovery fetch.
121
+ */
122
+ links?: string[];
117
123
  }
118
124
  interface CrawlLedgerDocument {
119
125
  tenantId: string;
@@ -258,6 +264,10 @@ interface SitemapConfig {
258
264
  */
259
265
  debug?: DebugOptions;
260
266
  crawlLedger?: CrawlLedgerOptions;
267
+ /** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
268
+ extractLinks?: boolean;
269
+ /** Max links kept per page when `extractLinks` is set (default: 200). */
270
+ maxLinksPerPage?: number;
261
271
  }
262
272
  /**
263
273
  * Direct URL list crawling configuration
@@ -277,6 +287,10 @@ interface UrlListConfig {
277
287
  debug?: DebugOptions;
278
288
  stripQueryParams?: boolean;
279
289
  crawlLedger?: CrawlLedgerOptions;
290
+ /** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
291
+ extractLinks?: boolean;
292
+ /** Max links kept per page when `extractLinks` is set (default: 200). */
293
+ maxLinksPerPage?: number;
280
294
  }
281
295
  /**
282
296
  * Single page ingestion (no discovery)
@@ -321,6 +335,10 @@ interface WebsiteCrawlConfig {
321
335
  renderOptions?: RenderOptions;
322
336
  debug?: DebugOptions;
323
337
  crawlLedger?: CrawlLedgerOptions;
338
+ /** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
339
+ extractLinks?: boolean;
340
+ /** Max links kept per page when `extractLinks` is set (default: 200). */
341
+ maxLinksPerPage?: number;
324
342
  }
325
343
  interface RenderOptions {
326
344
  /**
@@ -661,6 +679,12 @@ declare class WebRAGPlugin implements RAGPlugin {
661
679
  private normalizeWebsiteUrl;
662
680
  private fetchHtml;
663
681
  private extractInternalLinks;
682
+ /**
683
+ * When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
684
+ * caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
685
+ * undefined when disabled or on any parse error (link extraction must never fail a crawl).
686
+ */
687
+ private extractLinksIfEnabled;
664
688
  /**
665
689
  * Ingest content from a list of URLs
666
690
  *
@@ -805,4 +829,31 @@ declare function parsePrice(value: unknown): number | undefined;
805
829
  declare function normalizeCurrency(value: unknown): string | undefined;
806
830
  declare function normalizeAvailability(value: unknown): string | undefined;
807
831
 
808
- export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
832
/** Abstract, vertical-agnostic page roles used to classify crawled pages. */
type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
/** Input accepted by {@link resolvePageCardMetadata}. */
interface PageCardMetadataInput {
    /** URL of the page being classified. */
    url: string;
    /** Page title as extracted by the caller. */
    title?: string;
    /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
    headingTitle?: string;
    /** Page description; surfaced (trimmed) as `displayDescription`. */
    description?: string;
    /** Representative image; surfaced unchanged as `displayImageUrl`. */
    imageUrl?: string;
    /** Raw page HTML; when present it is inspected for JSON-LD / `og:type` hints. */
    html?: string;
    /** Type already resolved from typeFromUrl / defaultType. */
    type?: string;
    /** True when a product price was extracted; promotes an otherwise generic page to `detail`. */
    hasProductPrice?: boolean;
}
/** Card-oriented metadata produced by {@link resolvePageCardMetadata}. */
interface PageCardMetadataResult {
    /**
     * Resolved page role. Custom caller-supplied types pass through unchanged, so any string is
     * allowed; `(string & {})` keeps the known literals visible to editor autocomplete instead of
     * collapsing the union to plain `string`.
     */
    type: PageCardType | (string & {});
    /** Whether the page may be rendered as a card. */
    cardEligible: boolean;
    /** Numeric priority associated with the resolved type. */
    cardPriority: number;
    /** Title with site-name suffixes stripped, when a title was available. */
    displayTitle?: string;
    /** Trimmed description, when a non-blank one was supplied. */
    displayDescription?: string;
    /** Image URL passed through from the input. */
    displayImageUrl?: string;
}
/** Strip trailing " – Site" / " | Site" suffixes from a page title. */
declare function normalizeDisplayTitle(title?: string): string | undefined;
/** True when the URL or title marks the page as one that must never become a card. */
declare function hardExcludePage(url: string, title?: string): boolean;
/** Infer a page role from URL patterns alone; `undefined` when nothing matches. */
declare function inferTypeFromUrl(url: string): PageCardType | undefined;
/** Resolve type, card eligibility, priority and display fields for a page. */
declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
858
+
859
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
package/dist/index.d.ts CHANGED
@@ -114,6 +114,12 @@ interface CrawlPageStatusEntry {
114
114
  httpStatus?: number;
115
115
  error?: string;
116
116
  skippedReason?: string;
117
+ /**
118
+ * Same-origin internal links found on this page, populated only when `extractLinks` is set on
119
+ * the crawl config. Enables resumable recursive (BFS) crawling: the caller feeds these back into
120
+ * its own frontier instead of the SDK doing a separate link-discovery fetch.
121
+ */
122
+ links?: string[];
117
123
  }
118
124
  interface CrawlLedgerDocument {
119
125
  tenantId: string;
@@ -258,6 +264,10 @@ interface SitemapConfig {
258
264
  */
259
265
  debug?: DebugOptions;
260
266
  crawlLedger?: CrawlLedgerOptions;
267
+ /** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
268
+ extractLinks?: boolean;
269
+ /** Max links kept per page when `extractLinks` is set (default: 200). */
270
+ maxLinksPerPage?: number;
261
271
  }
262
272
  /**
263
273
  * Direct URL list crawling configuration
@@ -277,6 +287,10 @@ interface UrlListConfig {
277
287
  debug?: DebugOptions;
278
288
  stripQueryParams?: boolean;
279
289
  crawlLedger?: CrawlLedgerOptions;
290
+ /** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
291
+ extractLinks?: boolean;
292
+ /** Max links kept per page when `extractLinks` is set (default: 200). */
293
+ maxLinksPerPage?: number;
280
294
  }
281
295
  /**
282
296
  * Single page ingestion (no discovery)
@@ -321,6 +335,10 @@ interface WebsiteCrawlConfig {
321
335
  renderOptions?: RenderOptions;
322
336
  debug?: DebugOptions;
323
337
  crawlLedger?: CrawlLedgerOptions;
338
+ /** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
339
+ extractLinks?: boolean;
340
+ /** Max links kept per page when `extractLinks` is set (default: 200). */
341
+ maxLinksPerPage?: number;
324
342
  }
325
343
  interface RenderOptions {
326
344
  /**
@@ -661,6 +679,12 @@ declare class WebRAGPlugin implements RAGPlugin {
661
679
  private normalizeWebsiteUrl;
662
680
  private fetchHtml;
663
681
  private extractInternalLinks;
682
+ /**
683
+ * When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
684
+ * caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
685
+ * undefined when disabled or on any parse error (link extraction must never fail a crawl).
686
+ */
687
+ private extractLinksIfEnabled;
664
688
  /**
665
689
  * Ingest content from a list of URLs
666
690
  *
@@ -805,4 +829,31 @@ declare function parsePrice(value: unknown): number | undefined;
805
829
  declare function normalizeCurrency(value: unknown): string | undefined;
806
830
  declare function normalizeAvailability(value: unknown): string | undefined;
807
831
 
808
- export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
832
/** Abstract, vertical-agnostic page roles used to classify crawled pages. */
type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
/** Input accepted by {@link resolvePageCardMetadata}. */
interface PageCardMetadataInput {
    /** URL of the page being classified. */
    url: string;
    /** Page title as extracted by the caller. */
    title?: string;
    /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
    headingTitle?: string;
    /** Page description; surfaced (trimmed) as `displayDescription`. */
    description?: string;
    /** Representative image; surfaced unchanged as `displayImageUrl`. */
    imageUrl?: string;
    /** Raw page HTML; when present it is inspected for JSON-LD / `og:type` hints. */
    html?: string;
    /** Type already resolved from typeFromUrl / defaultType. */
    type?: string;
    /** True when a product price was extracted; promotes an otherwise generic page to `detail`. */
    hasProductPrice?: boolean;
}
/** Card-oriented metadata produced by {@link resolvePageCardMetadata}. */
interface PageCardMetadataResult {
    /**
     * Resolved page role. Custom caller-supplied types pass through unchanged, so any string is
     * allowed; `(string & {})` keeps the known literals visible to editor autocomplete instead of
     * collapsing the union to plain `string`.
     */
    type: PageCardType | (string & {});
    /** Whether the page may be rendered as a card. */
    cardEligible: boolean;
    /** Numeric priority associated with the resolved type. */
    cardPriority: number;
    /** Title with site-name suffixes stripped, when a title was available. */
    displayTitle?: string;
    /** Trimmed description, when a non-blank one was supplied. */
    displayDescription?: string;
    /** Image URL passed through from the input. */
    displayImageUrl?: string;
}
/** Strip trailing " – Site" / " | Site" suffixes from a page title. */
declare function normalizeDisplayTitle(title?: string): string | undefined;
/** True when the URL or title marks the page as one that must never become a card. */
declare function hardExcludePage(url: string, title?: string): boolean;
/** Infer a page role from URL patterns alone; `undefined` when nothing matches. */
declare function inferTypeFromUrl(url: string): PageCardType | undefined;
/** Resolve type, card eligibility, priority and display fields for a page. */
declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
858
+
859
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
package/dist/index.js CHANGED
@@ -34,9 +34,13 @@ __export(index_exports, {
34
34
  bodyTextLengthHint: () => bodyTextLengthHint,
35
35
  extractPageFromHtml: () => extractPageFromHtml,
36
36
  extractProductMetadata: () => extractProductMetadata,
37
+ hardExcludePage: () => hardExcludePage,
38
+ inferTypeFromUrl: () => inferTypeFromUrl,
37
39
  normalizeAvailability: () => normalizeAvailability,
38
40
  normalizeCurrency: () => normalizeCurrency,
41
+ normalizeDisplayTitle: () => normalizeDisplayTitle,
39
42
  parsePrice: () => parsePrice,
43
+ resolvePageCardMetadata: () => resolvePageCardMetadata,
40
44
  urlToDocumentId: () => urlToDocumentId
41
45
  });
42
46
  module.exports = __toCommonJS(index_exports);
@@ -44,12 +48,12 @@ module.exports = __toCommonJS(index_exports);
44
48
  // src/WebRAGPlugin.ts
45
49
  var import_mongodb = require("mongodb");
46
50
  var import_openai = __toESM(require("openai"));
47
- var cheerio3 = __toESM(require("cheerio"));
51
+ var cheerio4 = __toESM(require("cheerio"));
48
52
  var fs = __toESM(require("fs"));
49
53
  var path = __toESM(require("path"));
50
54
 
51
55
  // src/htmlPageExtract.ts
52
- var cheerio2 = __toESM(require("cheerio"));
56
+ var cheerio3 = __toESM(require("cheerio"));
53
57
 
54
58
  // src/productMetadata.ts
55
59
  var cheerio = __toESM(require("cheerio"));
@@ -216,6 +220,228 @@ function normalizeAvailability(value) {
216
220
  return s.replace(/\s+/g, "") || void 0;
217
221
  }
218
222
 
223
+ // src/pageCardMetadata.ts
224
+ var cheerio2 = __toESM(require("cheerio"));
225
// URL path segments that mark a page as never card-worthy (auth, cart, legal, taxonomy, ...).
// The trailing group also accepts "-" and "." so "login.php" or "thank-you" style slugs match.
var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
// Title phrases with the same "never a card" meaning.
var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
// Editorial sections (blog / news / press).
var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
// Promotional sections, matched on a whole path segment (or segment prefix before "-" / ".").
var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
// Promotional slugs, matched on a segment *suffix* such as "summer-sale".
var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
// "<collection>/<slug>" paths that identify a single entity page.
var ENTITY_DETAIL_PATH_RES = [
  /\/projects\/[^/]+/i,
  /\/project\/[^/]+/i,
  /\/perspectives\/[^/]+/i,
  /\/perspective\/[^/]+/i,
  /\/portfolio\/[^/]+/i,
  /\/case-stud(?:y|ies)\/[^/]+/i,
  /\/insights?\/[^/]+/i,
  /\/people\/[^/]+/i,
  /\/person\/[^/]+/i,
  /\/team-members?\/[^/]+/i,
  /\/members?\/[^/]+/i,
  /\/staff\/[^/]+/i,
  /\/experts?\/[^/]+/i,
  /\/authors?\/[^/]+/i,
  /\/leadership\/[^/]+/i,
  /\/biograph(?:y|ies)\/[^/]+/i
];
var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
// Title suffix separators: " – Site name" / " — Site name" and " | Site name".
var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
// The three lookup tables below are indexed with keys that originate outside this module
// (caller-supplied page types, JSON-LD "@type" values). They are created without a prototype so
// keys such as "constructor" or "__proto__" can never resolve to inherited Object members.
// Numeric card priority per page type.
var CARD_PRIORITY = Object.assign(/* @__PURE__ */ Object.create(null), {
  detail: 10,
  listing: 6,
  amenity: 5,
  promotion: 2,
  contact: 1,
  content: 1,
  blog: 0,
  system: 0,
  page: 3
});
// Default card eligibility per page type.
var CARD_ELIGIBLE_DEFAULT = Object.assign(/* @__PURE__ */ Object.create(null), {
  detail: true,
  listing: true,
  amenity: true,
  promotion: false,
  contact: false,
  content: false,
  blog: false,
  system: false,
  page: false
});
// Lower-cased schema.org type name -> page type.
var SCHEMA_TYPE_MAP = Object.assign(/* @__PURE__ */ Object.create(null), {
  product: "detail",
  service: "amenity",
  hotelroom: "detail",
  room: "detail",
  apartment: "detail",
  lodgingroom: "detail",
  course: "detail",
  event: "detail",
  offer: "promotion",
  person: "detail",
  employee: "detail",
  profilepage: "detail",
  article: "detail",
  newsarticle: "detail",
  blogposting: "detail",
  creativework: "detail"
});
294
/**
 * Strip trailing site-name suffixes (" – Site", " — Site", " | Site") from a page title.
 *
 * Up to two suffixes are removed. A dash suffix is only stripped when at least 4 characters
 * precede it, and a pipe suffix only when at least 8 precede it, so very short titles such as
 * "Spa | Hotel" are left intact.
 *
 * @param title Raw title; may be undefined.
 * @returns The cleaned title, or undefined when the input is missing or blank. (Blank input used
 *   to be echoed back verbatim, which let whitespace-only strings through as a truthy title.)
 */
function normalizeDisplayTitle(title) {
  const trimmed = typeof title === "string" ? title.trim() : "";
  if (!trimmed) return void 0;
  let t = trimmed;
  for (let pass = 0; pass < 2; pass++) {
    const dash = EN_DASH_SUFFIX_RE.exec(t);
    if (dash && dash.index >= 4) {
      t = t.slice(0, dash.index).trim();
      continue;
    }
    const pipe = PIPE_SUFFIX_RE.exec(t);
    if (pipe && pipe.index >= 8) {
      t = t.slice(0, pipe.index).trim();
      continue;
    }
    break;
  }
  // Never return an empty string: fall back to the trimmed original.
  return t || trimmed;
}
312
/**
 * Decide whether a page must never be offered as a card.
 *
 * A page is excluded when its path hits the hard-exclude or blog URL patterns, when its title
 * hits the hard-exclude title pattern, or when it is the site root.
 *
 * The URL patterns describe path segments, so they are applied to the pathname only. Matching
 * the full URL (the previous behaviour) excluded every page on hosts such as
 * "account.example.com" and any page whose query string contained e.g. "/login".
 *
 * @param url Absolute URL, or a relative path (used as-is, minus query/fragment).
 * @param title Optional page title.
 * @returns true when the page should be excluded.
 */
function hardExcludePage(url, title) {
  let parsed = null;
  try {
    parsed = new URL(url);
  } catch {
    // Relative or malformed URL: fall back to the raw string below.
  }
  const rawPath = parsed ? parsed.pathname : String(url).split(/[?#]/, 1)[0];
  const path2 = rawPath.toLowerCase();
  if (HARD_EXCLUDE_URL_RE.test(path2) || BLOG_URL_RE.test(path2)) return true;
  if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;
  // The site root is only recognisable when the URL parsed.
  return parsed !== null && (parsed.pathname === "/" || parsed.pathname === "");
}
324
/**
 * Infer a page role from its URL path.
 *
 * Checks run in a fixed order and the first hit wins: promotion, contact, entity detail
 * ("<collection>/<slug>"), amenity, detail, listing, blog.
 *
 * Only the pathname is inspected. The patterns describe path segments; matching the whole URL
 * (the previous behaviour) classified every page on a host like "deals.example.com" as a
 * promotion and let query-string values such as "?ref=/sale" decide the type.
 *
 * @param url Absolute URL, or a relative path (used as-is, minus query/fragment).
 * @returns The inferred type, or undefined when no pattern matches.
 */
function inferTypeFromUrl(url) {
  let rawPath;
  try {
    rawPath = new URL(url).pathname;
  } catch {
    // Relative or malformed URL: use the raw string without query/fragment.
    rawPath = String(url).split(/[?#]/, 1)[0];
  }
  const path2 = rawPath.toLowerCase();
  if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
  if (CONTACT_URL_RE.test(path2)) return "contact";
  if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
  if (AMENITY_URL_RE.test(path2)) return "amenity";
  if (DETAIL_URL_RE.test(path2)) return "detail";
  if (LISTING_URL_RE.test(path2)) return "listing";
  if (BLOG_URL_RE.test(path2)) return "blog";
  return void 0;
}
335
/**
 * Flatten a parsed JSON-LD payload into a list of its object nodes.
 *
 * Arrays are expanded in order, every plain object is collected, and an object's "@graph"
 * member (when truthy) is expanded right after the object itself. Primitives and null are
 * ignored. No other properties are descended into.
 *
 * @param data Parsed JSON value.
 * @returns Object nodes in document (pre-order) order.
 */
function collectJsonLdNodes2(data) {
  const collected = [];
  // Explicit work stack; entries are pushed in reverse so they pop in source order.
  const pending = [data];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current == null || typeof current !== "object") continue;
    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i--) {
        pending.push(current[i]);
      }
      continue;
    }
    collected.push(current);
    const graph = current["@graph"];
    if (graph) pending.push(graph);
  }
  return collected;
}
351
/**
 * Return the lower-cased short name of a JSON-LD node's "@type".
 *
 * When "@type" is an array only its first entry is considered. Anything up to and including
 * the last "/" is dropped, so "https://schema.org/Product" becomes "product".
 *
 * @param node JSON-LD object node.
 * @returns The short type name, or "" when the node declares no type.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  const first = Array.isArray(declared) ? declared[0] : declared;
  if (first == null) return "";
  const lowered = String(first).toLowerCase();
  // lastIndexOf yields -1 when there is no slash, which makes this slice(0).
  return lowered.slice(lowered.lastIndexOf("/") + 1);
}
360
/**
 * Infer a page role from structured data embedded in the HTML.
 *
 * Every `<script type="application/ld+json">` block is parsed in document order; the first node
 * whose schema type is listed in SCHEMA_TYPE_MAP decides the result, and any node carrying an
 * `offers` member counts as "detail". If JSON-LD gives no answer, `og:type = product` also maps
 * to "detail".
 *
 * @param html Raw page HTML.
 * @returns The inferred type, or undefined when the markup carries no usable hint.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);
  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;
    try {
      const parsed = JSON.parse(raw);
      for (const node of collectJsonLdNodes2(parsed)) {
        const name = schemaTypeName(node);
        // Own-property check: "@type" comes from the crawled page, so a value like
        // "constructor" must not resolve to an inherited Object.prototype member.
        if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
          return SCHEMA_TYPE_MAP[name];
        }
        if (node.offers != null) return "detail";
      }
    } catch {
      // Malformed JSON-LD is common in the wild; skip this block and try the next one.
    }
  }
  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  if (ogType === "product") return "detail";
  return void 0;
}
379
/**
 * Map a caller-supplied page type onto the canonical set.
 *
 * Known types are returned lower-cased; "room"/"product" become "detail" and "offer"/"sale"
 * become "promotion" (all case-insensitive). Unknown types are returned exactly as given, and
 * a missing or empty type becomes "page".
 *
 * @param raw Type string from the caller, if any.
 * @returns The canonical or pass-through type.
 */
function normalizePageType(raw) {
  if (!raw) return "page";
  const key = raw.toLowerCase();
  switch (key) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return key;
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      // Custom types keep their original casing.
      return raw;
  }
}
398
/**
 * Pick the title to display for a page: the trimmed h1 heading when it is non-blank, otherwise
 * the document title, in both cases passed through normalizeDisplayTitle.
 *
 * @param input Page card metadata input.
 * @returns Whatever normalizeDisplayTitle returns for the chosen title.
 */
function resolveDisplayTitle(input) {
  const h1 = input.headingTitle?.trim();
  return normalizeDisplayTitle(h1 ? h1 : input.title);
}
403
/**
 * Resolve the card metadata (type, eligibility, priority, display fields) for a page.
 *
 * Resolution order:
 *  1. Hard-excluded pages short-circuit to type "system", not eligible, priority 0.
 *  2. The caller-supplied type is normalised; while it is still the generic "page" it is
 *     refined from embedded schema data (when HTML is given), then from the URL, and finally
 *     promoted to "detail" when a product price was found.
 *  3. Eligibility and priority are looked up per type; unknown types are not eligible and get
 *     priority 3. An otherwise eligible page under a promotional path is made ineligible.
 *
 * Changes from the previous implementation:
 *  - `displayDescription` is trimmed (blank -> undefined) on the excluded path as well, so both
 *    return shapes agree.
 *  - Table lookups use own-property checks, so a custom type such as "constructor" can no
 *    longer pick up inherited Object members as its eligibility/priority.
 *  - The promotional check looks at the URL pathname only, not the host or query string.
 *
 * @param input Page description; see PageCardMetadataInput.
 * @returns The resolved card metadata.
 */
function resolvePageCardMetadata(input) {
  const url = input.url;
  const title = input.title?.trim();
  const displayTitle = resolveDisplayTitle(input);
  const displayDescription = input.description?.trim() || void 0;
  const displayImageUrl = input.imageUrl;
  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl
    };
  }
  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    const fromSchema = inferTypeFromSchema(input.html);
    if (fromSchema) type = fromSchema;
  }
  if (type === "page") {
    const fromUrl = inferTypeFromUrl(url);
    if (fromUrl) type = fromUrl;
  }
  if (input.hasProductPrice && type === "page") {
    type = "detail";
  }
  const typeKey = String(type).toLowerCase();
  const hasOwnEntry = (table) => Object.prototype.hasOwnProperty.call(table, typeKey);
  let cardEligible = hasOwnEntry(CARD_ELIGIBLE_DEFAULT) ? CARD_ELIGIBLE_DEFAULT[typeKey] ?? false : false;
  const cardPriority = hasOwnEntry(CARD_PRIORITY) ? CARD_PRIORITY[typeKey] ?? 3 : 3;
  if (cardEligible) {
    let promoPath;
    try {
      promoPath = new URL(url).pathname;
    } catch {
      // Relative or malformed URL: use the raw string without query/fragment.
      promoPath = String(url).split(/[?#]/, 1)[0];
    }
    if (PROMOTION_URL_RE.test(promoPath.toLowerCase())) {
      cardEligible = false;
    }
  }
  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl
  };
}
444
+
219
445
  // src/htmlPageExtract.ts
220
446
  var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
221
447
  var DEFAULT_REMOVE_SELECTORS = [
@@ -238,17 +464,23 @@ function cleanContent(text) {
238
464
  return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
239
465
  }
240
466
  function bodyTextLengthHint(html, options = {}) {
241
- const $ = cheerio2.load(html);
467
+ const $ = cheerio3.load(html);
242
468
  stripNoiseFromDom($, options);
243
469
  return cleanContent($("body").text().trim()).length;
244
470
  }
245
471
  function extractPageFromHtml(url, html, options = {}) {
246
- const $ = cheerio2.load(html);
472
+ const $ = cheerio3.load(html);
247
473
  stripNoiseFromDom($, options);
248
- const titleSelector = options.titleSelector || "h1, title";
249
- let title = $(titleSelector).first().text().trim();
474
+ const h1Title = $("h1").first().text().trim();
475
+ const docTitle = $("title").text().trim();
476
+ let title = "";
477
+ if (options.titleSelector) {
478
+ title = $(options.titleSelector).first().text().trim();
479
+ } else {
480
+ title = docTitle || h1Title;
481
+ }
250
482
  if (!title) {
251
- title = $("title").text().trim();
483
+ title = h1Title || docTitle;
252
484
  }
253
485
  const content = extractBestContentText($, options);
254
486
  const minChars = options.minExtractedContentLength ?? 50;
@@ -273,12 +505,27 @@ function extractPageFromHtml(url, html, options = {}) {
273
505
  }
274
506
  }
275
507
  const productMeta = extractProductMetadata(html);
276
- const metadata = {
508
+ const cardMeta = resolvePageCardMetadata({
509
+ url,
510
+ title,
511
+ headingTitle: h1Title || void 0,
512
+ description,
513
+ imageUrl,
514
+ html,
277
515
  type,
516
+ hasProductPrice: productMeta.price != null
517
+ });
518
+ const metadata = {
519
+ type: cardMeta.type,
520
+ cardEligible: cardMeta.cardEligible,
521
+ cardPriority: cardMeta.cardPriority,
278
522
  ...title ? { title } : {},
523
+ ...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
279
524
  url,
280
525
  ...imageUrl ? { imageUrl } : {},
526
+ ...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
281
527
  ...description ? { description } : {},
528
+ ...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
282
529
  ...productMeta.price != null ? { price: productMeta.price } : {},
283
530
  ...productMeta.currency ? { currency: productMeta.currency } : {},
284
531
  ...productMeta.availability ? { availability: productMeta.availability } : {},
@@ -584,13 +831,18 @@ var WebRAGPlugin = class {
584
831
  plugin: this.name,
585
832
  contentCount: scoredResults.length,
586
833
  types: [...new Set(scoredResults.map((d) => d.metadata.type))],
587
- topResults: scoredResults.slice(0, 5).map((doc) => ({
834
+ topResults: scoredResults.slice(0, 16).map((doc) => ({
588
835
  id: doc.id,
589
836
  type: doc.metadata.type,
590
837
  title: doc.metadata.title,
591
838
  url: doc.metadata.url,
592
839
  imageUrl: doc.metadata.imageUrl,
593
840
  description: doc.metadata.description,
841
+ cardEligible: doc.metadata.cardEligible,
842
+ cardPriority: doc.metadata.cardPriority,
843
+ displayTitle: doc.metadata.displayTitle,
844
+ displayDescription: doc.metadata.displayDescription,
845
+ displayImageUrl: doc.metadata.displayImageUrl,
594
846
  ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
595
847
  ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
596
848
  ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
@@ -1625,7 +1877,9 @@ var WebRAGPlugin = class {
1625
1877
  render: config.render,
1626
1878
  renderOptions: config.renderOptions,
1627
1879
  debug: config.debug,
1628
- crawlLedger: config.crawlLedger
1880
+ crawlLedger: config.crawlLedger,
1881
+ extractLinks: config.extractLinks,
1882
+ maxLinksPerPage: config.maxLinksPerPage
1629
1883
  }, options);
1630
1884
  return {
1631
1885
  ...result,
@@ -1764,7 +2018,7 @@ var WebRAGPlugin = class {
1764
2018
  return await response.text();
1765
2019
  }
1766
2020
  extractInternalLinks(html, base, stripQueryParams) {
1767
- const $ = cheerio3.load(html);
2021
+ const $ = cheerio4.load(html);
1768
2022
  const links = /* @__PURE__ */ new Set();
1769
2023
  $("a[href]").each((_, el) => {
1770
2024
  const href = ($(el).attr("href") || "").trim();
@@ -1781,6 +2035,22 @@ var WebRAGPlugin = class {
1781
2035
  });
1782
2036
  return Array.from(links);
1783
2037
  }
2038
+ /**
2039
+ * When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
2040
+ * caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
2041
+ * undefined when disabled or on any parse error (link extraction must never fail a crawl).
2042
+ */
2043
+ extractLinksIfEnabled(url, html, config) {
2044
+ if (!config.extractLinks) return void 0;
2045
+ try {
2046
+ const base = new URL(url);
2047
+ const links = this.extractInternalLinks(html, base, config.stripQueryParams ?? false);
2048
+ const cap = config.maxLinksPerPage ?? 200;
2049
+ return links.length > cap ? links.slice(0, cap) : links;
2050
+ } catch {
2051
+ return void 0;
2052
+ }
2053
+ }
1784
2054
  /**
1785
2055
  * Ingest content from a list of URLs
1786
2056
  *
@@ -1811,7 +2081,9 @@ var WebRAGPlugin = class {
1811
2081
  render: config.render,
1812
2082
  renderOptions: config.renderOptions,
1813
2083
  debug: config.debug,
1814
- crawlLedger: config.crawlLedger
2084
+ crawlLedger: config.crawlLedger,
2085
+ extractLinks: config.extractLinks,
2086
+ maxLinksPerPage: config.maxLinksPerPage
1815
2087
  }, options);
1816
2088
  }
1817
2089
  /**
@@ -1930,7 +2202,7 @@ var WebRAGPlugin = class {
1930
2202
  }
1931
2203
  }
1932
2204
  try {
1933
- const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
2205
+ const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2, links } = await this.crawlPageSmart(url, config, timeout, {
1934
2206
  renderMode,
1935
2207
  renderOptions,
1936
2208
  minContentLength,
@@ -1964,7 +2236,8 @@ var WebRAGPlugin = class {
1964
2236
  bodyTextLengthHint: bodyTextLengthHint2,
1965
2237
  title: doc?.metadata?.title,
1966
2238
  docId: doc?.id,
1967
- error: diag?.errorMessage
2239
+ error: diag?.errorMessage,
2240
+ ...links ? { links } : {}
1968
2241
  });
1969
2242
  this.emitCrawlPage(config, {
1970
2243
  url,
@@ -2102,41 +2375,39 @@ var WebRAGPlugin = class {
2102
2375
  const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
2103
2376
  return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
2104
2377
  }
2105
- diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
2378
+ diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed, links) {
2106
2379
  if (blockedSuspected) {
2107
2380
  return {
2108
2381
  doc: null,
2109
- diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
2382
+ diag: { modeUsed: modeFailed, reason: "blocked_suspected" },
2383
+ links
2110
2384
  };
2111
2385
  }
2112
2386
  if (renderFailure) {
2113
2387
  return {
2114
2388
  doc: null,
2115
- diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
2389
+ diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure },
2390
+ links
2116
2391
  };
2117
2392
  }
2118
2393
  return {
2119
2394
  doc,
2120
2395
  diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
2121
- bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
2396
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2,
2397
+ links
2122
2398
  };
2123
2399
  }
2124
2400
  async crawlPageSmart(url, config, timeout, ctx) {
2125
2401
  if (ctx.renderMode === true) {
2126
- const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
2127
- url,
2128
- config,
2129
- timeout,
2130
- ctx.renderOptions,
2131
- ctx.dbg
2132
- );
2402
+ const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected, links } = await this.crawlPageRendered(url, config, timeout, ctx.renderOptions, ctx.dbg);
2133
2403
  return this.diagFromRenderedAttempt(
2134
2404
  doc,
2135
2405
  bodyTextLengthHint2,
2136
2406
  renderFailure,
2137
2407
  blockedSuspected,
2138
2408
  "render_ok",
2139
- "render_failed"
2409
+ "render_failed",
2410
+ links
2140
2411
  );
2141
2412
  }
2142
2413
  try {
@@ -2162,8 +2433,9 @@ var WebRAGPlugin = class {
2162
2433
  const html = await response.text();
2163
2434
  const doc = this.extractDocumentFromHtml(url, html, config);
2164
2435
  const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
2436
+ const staticLinks = this.extractLinksIfEnabled(url, html, config);
2165
2437
  if (doc && doc.content.length >= ctx.minContentLength) {
2166
- return { doc, diag: { modeUsed: "static_ok" } };
2438
+ return { doc, diag: { modeUsed: "static_ok" }, links: staticLinks };
2167
2439
  }
2168
2440
  if (ctx.renderMode === "auto") {
2169
2441
  const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
@@ -2177,7 +2449,8 @@ var WebRAGPlugin = class {
2177
2449
  doc: rendered,
2178
2450
  bodyTextLengthHint: rHint,
2179
2451
  renderFailure,
2180
- blockedSuspected
2452
+ blockedSuspected,
2453
+ links: renderedLinks
2181
2454
  } = await this.crawlPageRendered(
2182
2455
  url,
2183
2456
  config,
@@ -2192,7 +2465,9 @@ var WebRAGPlugin = class {
2192
2465
  renderFailure,
2193
2466
  blockedSuspected,
2194
2467
  "render_fallback_ok",
2195
- "render_fallback_failed"
2468
+ "render_fallback_failed",
2469
+ // Prefer links from the rendered DOM; fall back to the static HTML's links.
2470
+ renderedLinks ?? staticLinks
2196
2471
  );
2197
2472
  if (!rendered && (renderFailure || blockedSuspected)) {
2198
2473
  fb.bodyTextLengthHint = staticHint ?? rHint;
@@ -2203,7 +2478,8 @@ var WebRAGPlugin = class {
2203
2478
  return {
2204
2479
  doc: null,
2205
2480
  diag: { modeUsed: "static_failed", reason: "too_small" },
2206
- bodyTextLengthHint: staticHint
2481
+ bodyTextLengthHint: staticHint,
2482
+ links: staticLinks
2207
2483
  };
2208
2484
  } catch (e) {
2209
2485
  throw e;
@@ -2251,6 +2527,7 @@ var WebRAGPlugin = class {
2251
2527
  const html = await page.content();
2252
2528
  const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
2253
2529
  const doc = this.extractDocumentFromHtml(url, html, config);
2530
+ const links = this.extractLinksIfEnabled(url, html, config);
2254
2531
  if (config.debug?.saveDir && config.debug?.enabled) {
2255
2532
  try {
2256
2533
  const saveDir = config.debug.saveDir;
@@ -2264,7 +2541,7 @@ var WebRAGPlugin = class {
2264
2541
  dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
2265
2542
  }
2266
2543
  }
2267
- return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
2544
+ return { doc, bodyTextLengthHint: bodyTextLengthHint2, links };
2268
2545
  } catch (e) {
2269
2546
  const msg = String(e?.message || e || "render_failed");
2270
2547
  const lower = msg.toLowerCase();
@@ -2631,8 +2908,12 @@ var WebRAGPlugin = class {
2631
2908
  bodyTextLengthHint,
2632
2909
  extractPageFromHtml,
2633
2910
  extractProductMetadata,
2911
+ hardExcludePage,
2912
+ inferTypeFromUrl,
2634
2913
  normalizeAvailability,
2635
2914
  normalizeCurrency,
2915
+ normalizeDisplayTitle,
2636
2916
  parsePrice,
2917
+ resolvePageCardMetadata,
2637
2918
  urlToDocumentId
2638
2919
  });
package/dist/index.mjs CHANGED
@@ -1,12 +1,12 @@
1
1
  // src/WebRAGPlugin.ts
2
2
  import { MongoClient } from "mongodb";
3
3
  import OpenAI from "openai";
4
- import * as cheerio3 from "cheerio";
4
+ import * as cheerio4 from "cheerio";
5
5
  import * as fs from "fs";
6
6
  import * as path from "path";
7
7
 
8
8
  // src/htmlPageExtract.ts
9
- import * as cheerio2 from "cheerio";
9
+ import * as cheerio3 from "cheerio";
10
10
 
11
11
  // src/productMetadata.ts
12
12
  import * as cheerio from "cheerio";
@@ -173,6 +173,228 @@ function normalizeAvailability(value) {
173
173
  return s.replace(/\s+/g, "") || void 0;
174
174
  }
175
175
 
176
+ // src/pageCardMetadata.ts
177
+ import * as cheerio2 from "cheerio";
// URL pattern for pages that must never become cards (auth, cart, legal, taxonomy, ...).
// hardExcludePage tests it against the lower-cased URL; a keyword counts when it starts a
// path segment and is followed by "/", "-", "." or the end of the string.
178
+ var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
// Title counterpart of the pattern above; hardExcludePage tests it against the lower-cased title.
179
+ var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
// Blog/news sections. Used twice: hardExcludePage excludes on a match, and inferTypeFromUrl
// maps a match to "blog" as its last rule.
180
+ var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
// Promotional sections. inferTypeFromUrl maps a match to "promotion"; resolvePageCardMetadata
// also uses it to force cardEligible to false.
181
+ var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
// Promotional slugs: a path segment that ends in -sale, -offer, -promo, -deal or -buster.
182
+ var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
// Section paths that identify a single entity page ("detail") when followed by at least one
// more non-slash character. inferTypeFromUrl checks these before the amenity, detail and
// listing patterns. They are not anchored to the start of a segment.
183
+ var ENTITY_DETAIL_PATH_RES = [
184
+ /\/projects\/[^/]+/i,
185
+ /\/project\/[^/]+/i,
186
+ /\/perspectives\/[^/]+/i,
187
+ /\/perspective\/[^/]+/i,
188
+ /\/portfolio\/[^/]+/i,
189
+ /\/case-stud(?:y|ies)\/[^/]+/i,
190
+ /\/insights?\/[^/]+/i,
191
+ /\/people\/[^/]+/i,
192
+ /\/person\/[^/]+/i,
193
+ /\/team-members?\/[^/]+/i,
194
+ /\/members?\/[^/]+/i,
195
+ /\/staff\/[^/]+/i,
196
+ /\/experts?\/[^/]+/i,
197
+ /\/authors?\/[^/]+/i,
198
+ /\/leadership\/[^/]+/i,
199
+ /\/biograph(?:y|ies)\/[^/]+/i
200
+ ];
// Whole-segment keywords mapped to "detail" by inferTypeFromUrl.
// NOTE(review): "products" and "rooms" appear here and in LISTING_URL_RE; inferTypeFromUrl tests
// this pattern first, so those segments resolve to "detail" - confirm that is intended.
201
+ var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
// Whole-segment keywords mapped to "listing" by inferTypeFromUrl (checked after DETAIL_URL_RE).
202
+ var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
// Whole-segment keywords mapped to "amenity" by inferTypeFromUrl.
203
+ var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
// Whole-segment keywords mapped to "contact" by inferTypeFromUrl.
204
+ var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
// Title suffix introduced by a spaced en dash or em dash (the character class holds both);
// normalizeDisplayTitle cuts the title at the match.
205
+ var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
// Title suffix introduced by a spaced pipe; normalizeDisplayTitle cuts the title at the match.
206
+ var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
// Card priority per page type, read by resolvePageCardMetadata (unknown types fall back to 3).
// NOTE(review): presumably a larger number ranks the card higher - confirm against the consumer.
207
+ var CARD_PRIORITY = {
208
+ detail: 10,
209
+ listing: 6,
210
+ amenity: 5,
211
+ promotion: 2,
212
+ contact: 1,
213
+ content: 1,
214
+ blog: 0,
215
+ system: 0,
216
+ page: 3
217
+ };
// Default card eligibility per page type, read by resolvePageCardMetadata (unknown types
// fall back to false).
218
+ var CARD_ELIGIBLE_DEFAULT = {
219
+ detail: true,
220
+ listing: true,
221
+ amenity: true,
222
+ promotion: false,
223
+ contact: false,
224
+ content: false,
225
+ blog: false,
226
+ system: false,
227
+ page: false
228
+ };
// Maps a lower-cased schema type name (as produced by schemaTypeName) to a page type;
// consulted by inferTypeFromSchema for every JSON-LD node.
229
+ var SCHEMA_TYPE_MAP = {
230
+ product: "detail",
231
+ service: "amenity",
232
+ hotelroom: "detail",
233
+ room: "detail",
234
+ apartment: "detail",
235
+ lodgingroom: "detail",
236
+ course: "detail",
237
+ event: "detail",
238
+ offer: "promotion",
239
+ person: "detail",
240
+ employee: "detail",
241
+ profilepage: "detail",
242
+ article: "detail",
243
+ newsarticle: "detail",
244
+ blogposting: "detail",
245
+ creativework: "detail"
246
+ };
/**
 * Strips trailing site-name style suffixes from a page title.
 *
 * A suffix is everything from the first spaced en/em dash (kept prefix must be at least
 * 4 characters) or, failing that, the first spaced pipe (kept prefix must be at least
 * 8 characters). At most two suffixes are removed.
 *
 * @param title Raw title; may be undefined.
 * @returns The shortened, trimmed title. Blank or missing input is returned unchanged.
 */
function normalizeDisplayTitle(title) {
  const trimmed = title?.trim();
  if (!trimmed) return title;

  // Returns the text before the separator, or null when the separator is absent or the
  // remaining prefix would be shorter than `minPrefix`.
  const cutAt = (text, separator, minPrefix) => {
    const hit = separator.exec(text);
    if (hit === null || hit.index < minPrefix) return null;
    return text.slice(0, hit.index).trim();
  };

  let current = trimmed;
  let passes = 0;
  while (passes < 2) {
    // The dash separator takes precedence over the pipe separator on every pass.
    const shortened = cutAt(current, EN_DASH_SUFFIX_RE, 4) ?? cutAt(current, PIPE_SUFFIX_RE, 8);
    if (shortened === null) break;
    current = shortened;
    passes += 1;
  }
  return current || trimmed;
}
/**
 * Decides whether a page must never be offered as a card.
 *
 * A page is excluded when its path matches HARD_EXCLUDE_URL_RE or BLOG_URL_RE, when its
 * title matches HARD_EXCLUDE_TITLE_RE, or when it is the site root.
 *
 * Only the URL path is matched. Matching the whole URL let host labels trigger exclusion
 * (every page on "https://success-academy.org" or "https://jobs.example.com") and let a
 * query string hide a terminal segment ("/login?next=/" was not excluded).
 *
 * @param url Absolute page URL; a bare path is accepted and matched as-is minus query/fragment.
 * @param title Optional page title.
 * @returns true when the page is excluded.
 */
function hardExcludePage(url, title) {
  let pathname;
  let isAbsolute = false;
  try {
    pathname = new URL(url).pathname;
    isAbsolute = true;
  } catch {
    // Not an absolute URL: fall back to the raw string without its query and fragment.
    pathname = url.split(/[?#]/, 1)[0];
  }

  // The site root is only recognised for URLs that parsed successfully.
  if (isAbsolute && (pathname === "/" || pathname === "")) return true;

  const path2 = pathname.toLowerCase();
  if (HARD_EXCLUDE_URL_RE.test(path2) || BLOG_URL_RE.test(path2)) return true;

  return Boolean(title) && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase());
}
/**
 * Infers a page type from the URL path alone.
 *
 * Rules are evaluated in order and the first match wins: promotion, contact, entity
 * detail paths, amenity, detail, listing, blog.
 *
 * Only the URL path is matched. Matching the whole URL let a host label such as
 * "deals.example.com" classify every page as a promotion, and let a query string satisfy
 * the entity-detail patterns ("/projects/?page=2" was reported as "detail").
 *
 * @param url Absolute page URL; a bare path is accepted and matched as-is minus query/fragment.
 * @returns The inferred type, or undefined when no rule matches.
 */
function inferTypeFromUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: fall back to the raw string without its query and fragment.
    pathname = url.split(/[?#]/, 1)[0];
  }
  const path2 = pathname.toLowerCase();

  // Ordered rule table; earlier entries take precedence over later ones.
  const rules = [
    ["promotion", (p) => PROMOTION_URL_RE.test(p) || PROMOTION_SLUG_RE.test(p)],
    ["contact", (p) => CONTACT_URL_RE.test(p)],
    ["detail", (p) => ENTITY_DETAIL_PATH_RES.some((re) => re.test(p))],
    ["amenity", (p) => AMENITY_URL_RE.test(p)],
    ["detail", (p) => DETAIL_URL_RE.test(p)],
    ["listing", (p) => LISTING_URL_RE.test(p)],
    ["blog", (p) => BLOG_URL_RE.test(p)]
  ];
  for (const [type, matches] of rules) {
    if (matches(path2)) return type;
  }
  return void 0;
}
/**
 * Flattens a parsed JSON-LD payload into a list of its object nodes.
 *
 * Arrays are expanded in order; every object is emitted and then its "@graph" member
 * (when truthy) is expanded right after it. Primitives and null/undefined are ignored.
 * No other members are descended into.
 *
 * @param data Parsed JSON-LD value (object, array, primitive or null).
 * @returns Object nodes in depth-first, document order.
 */
function collectJsonLdNodes2(data) {
  const collected = [];
  // Explicit stack; children are pushed in reverse so they are popped in document order.
  const pending = [data];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current === null || current === undefined) continue;
    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i -= 1) {
        pending.push(current[i]);
      }
      continue;
    }
    if (typeof current !== "object") continue;
    collected.push(current);
    const graph = current["@graph"];
    if (graph) pending.push(graph);
  }
  return collected;
}
/**
 * Returns the lower-cased local name of a JSON-LD node's first "@type".
 *
 * Accepts bare names ("Product"), absolute IRIs ("https://schema.org/Product"), fragment
 * IRIs ("https://example.org/vocab#Product") and compact IRIs ("schema:Product"). The
 * last two were previously returned whole and therefore never matched a type table.
 *
 * @param node JSON-LD node object.
 * @returns The local type name in lower case, or "" when the node declares no type.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  // Only the first entry of a multi-typed node is considered.
  const first = Array.isArray(declared) ? declared[0] : declared;
  if (first == null) return "";

  const lowered = String(first).trim().toLowerCase();
  // The local name starts after the last "/", "#" or ":" separator, whichever comes last.
  const cut = Math.max(
    lowered.lastIndexOf("/"),
    lowered.lastIndexOf("#"),
    lowered.lastIndexOf(":")
  );
  return cut >= 0 ? lowered.slice(cut + 1) : lowered;
}
/**
 * Infers a page type from structured data embedded in the HTML.
 *
 * Every `application/ld+json` script is parsed in document order. The first node whose
 * type name is a key of SCHEMA_TYPE_MAP decides the result; a node carrying an `offers`
 * member is treated as "detail". If no JSON-LD node decides, an `og:type` of "product"
 * yields "detail".
 *
 * @param html Full page HTML.
 * @returns The inferred page type, or undefined when nothing matched.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);
  const scripts = $('script[type="application/ld+json"]').toArray();

  for (const el of scripts) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;
    try {
      const parsed = JSON.parse(raw);
      for (const node of collectJsonLdNodes2(parsed)) {
        const name = schemaTypeName(node);
        // Own-property check: a type named "constructor" or "__proto__" must not resolve
        // to an inherited Object.prototype member.
        if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
          return SCHEMA_TYPE_MAP[name];
        }
        if (node.offers != null) return "detail";
      }
    } catch {
      // Malformed JSON-LD is deliberately ignored; the remaining scripts are still examined.
    }
  }

  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  return ogType === "product" ? "detail" : void 0;
}
/**
 * Normalizes a caller-supplied page type.
 *
 * Known types are returned in lower case, a few aliases are folded ("room"/"product" to
 * "detail", "offer"/"sale" to "promotion"), and a missing type becomes "page". Any other
 * value is returned with its original casing so custom types survive.
 *
 * Surrounding whitespace is ignored. A blank or padded string used to bypass both the
 * "page" default and the known-type match, which suppressed later type inference.
 *
 * @param raw Type as provided by the caller; may be undefined.
 * @returns The normalized type.
 */
function normalizePageType(raw) {
  const cleaned = typeof raw === "string" ? raw.trim() : raw;
  if (!cleaned) return "page";

  const lower = String(cleaned).toLowerCase();
  switch (lower) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return lower;
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      // Unknown (custom) type: keep the caller's casing.
      return cleaned;
  }
}
/**
 * Picks the title shown on a card: the page heading when it is non-blank, otherwise the
 * document title, passed through normalizeDisplayTitle either way.
 *
 * @param input Object with optional `headingTitle` and `title` strings.
 * @returns The normalized display title.
 */
function resolveDisplayTitle(input) {
  const fromHeading = input.headingTitle?.trim();
  const source = fromHeading ? fromHeading : input.title;
  return normalizeDisplayTitle(source);
}
/**
 * Computes the card metadata (type, eligibility, priority and display fields) for a page.
 *
 * Hard-excluded pages are reported as type "system" and are never card eligible.
 * Otherwise the type is resolved in this order: the supplied `input.type`, JSON-LD /
 * OpenGraph data in `input.html`, the URL, and finally "detail" when a product price was
 * found. Steps after the first only run while the type is still the generic "page".
 *
 * @param input Page facts: `url`, `title`, `headingTitle`, `description`, `imageUrl`,
 *   `html`, `type`, `hasProductPrice`.
 * @returns `{ type, cardEligible, cardPriority, displayTitle, displayDescription, displayImageUrl }`.
 */
function resolvePageCardMetadata(input) {
  const url = input.url;
  const title = input.title?.trim();
  const displayTitle = resolveDisplayTitle(input);
  // Normalized once so both return paths agree; the excluded path used to pass the raw,
  // untrimmed description through.
  const displayDescription = input.description?.trim() || void 0;
  const displayImageUrl = input.imageUrl;

  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl
    };
  }

  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    const fromSchema = inferTypeFromSchema(input.html);
    if (fromSchema) type = fromSchema;
  }
  if (type === "page") {
    const fromUrl = inferTypeFromUrl(url);
    if (fromUrl) type = fromUrl;
  }
  if (type === "page" && input.hasProductPrice) {
    type = "detail";
  }

  // Own-property lookup: a custom type such as "constructor" must fall back to the
  // defaults instead of resolving to an inherited Object.prototype member.
  const typeKey = String(type).toLowerCase();
  const lookup = (table, fallback) =>
    Object.prototype.hasOwnProperty.call(table, typeKey) ? table[typeKey] : fallback;

  let cardEligible = lookup(CARD_ELIGIBLE_DEFAULT, false);
  const cardPriority = lookup(CARD_PRIORITY, 3);

  // A promotional URL overrides eligibility even when the type came from elsewhere.
  if (cardEligible && PROMOTION_URL_RE.test(url.toLowerCase())) {
    cardEligible = false;
  }

  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl
  };
}
397
+
176
398
  // src/htmlPageExtract.ts
177
399
  var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
178
400
  var DEFAULT_REMOVE_SELECTORS = [
@@ -195,17 +417,23 @@ function cleanContent(text) {
195
417
  return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
196
418
  }
197
419
  function bodyTextLengthHint(html, options = {}) {
198
- const $ = cheerio2.load(html);
420
+ const $ = cheerio3.load(html);
199
421
  stripNoiseFromDom($, options);
200
422
  return cleanContent($("body").text().trim()).length;
201
423
  }
202
424
  function extractPageFromHtml(url, html, options = {}) {
203
- const $ = cheerio2.load(html);
425
+ const $ = cheerio3.load(html);
204
426
  stripNoiseFromDom($, options);
205
- const titleSelector = options.titleSelector || "h1, title";
206
- let title = $(titleSelector).first().text().trim();
427
+ const h1Title = $("h1").first().text().trim();
428
+ const docTitle = $("title").text().trim();
429
+ let title = "";
430
+ if (options.titleSelector) {
431
+ title = $(options.titleSelector).first().text().trim();
432
+ } else {
433
+ title = docTitle || h1Title;
434
+ }
207
435
  if (!title) {
208
- title = $("title").text().trim();
436
+ title = h1Title || docTitle;
209
437
  }
210
438
  const content = extractBestContentText($, options);
211
439
  const minChars = options.minExtractedContentLength ?? 50;
@@ -230,12 +458,27 @@ function extractPageFromHtml(url, html, options = {}) {
230
458
  }
231
459
  }
232
460
  const productMeta = extractProductMetadata(html);
233
- const metadata = {
461
+ const cardMeta = resolvePageCardMetadata({
462
+ url,
463
+ title,
464
+ headingTitle: h1Title || void 0,
465
+ description,
466
+ imageUrl,
467
+ html,
234
468
  type,
469
+ hasProductPrice: productMeta.price != null
470
+ });
471
+ const metadata = {
472
+ type: cardMeta.type,
473
+ cardEligible: cardMeta.cardEligible,
474
+ cardPriority: cardMeta.cardPriority,
235
475
  ...title ? { title } : {},
476
+ ...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
236
477
  url,
237
478
  ...imageUrl ? { imageUrl } : {},
479
+ ...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
238
480
  ...description ? { description } : {},
481
+ ...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
239
482
  ...productMeta.price != null ? { price: productMeta.price } : {},
240
483
  ...productMeta.currency ? { currency: productMeta.currency } : {},
241
484
  ...productMeta.availability ? { availability: productMeta.availability } : {},
@@ -541,13 +784,18 @@ var WebRAGPlugin = class {
541
784
  plugin: this.name,
542
785
  contentCount: scoredResults.length,
543
786
  types: [...new Set(scoredResults.map((d) => d.metadata.type))],
544
- topResults: scoredResults.slice(0, 5).map((doc) => ({
787
+ topResults: scoredResults.slice(0, 16).map((doc) => ({
545
788
  id: doc.id,
546
789
  type: doc.metadata.type,
547
790
  title: doc.metadata.title,
548
791
  url: doc.metadata.url,
549
792
  imageUrl: doc.metadata.imageUrl,
550
793
  description: doc.metadata.description,
794
+ cardEligible: doc.metadata.cardEligible,
795
+ cardPriority: doc.metadata.cardPriority,
796
+ displayTitle: doc.metadata.displayTitle,
797
+ displayDescription: doc.metadata.displayDescription,
798
+ displayImageUrl: doc.metadata.displayImageUrl,
551
799
  ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
552
800
  ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
553
801
  ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
@@ -1582,7 +1830,9 @@ var WebRAGPlugin = class {
1582
1830
  render: config.render,
1583
1831
  renderOptions: config.renderOptions,
1584
1832
  debug: config.debug,
1585
- crawlLedger: config.crawlLedger
1833
+ crawlLedger: config.crawlLedger,
1834
+ extractLinks: config.extractLinks,
1835
+ maxLinksPerPage: config.maxLinksPerPage
1586
1836
  }, options);
1587
1837
  return {
1588
1838
  ...result,
@@ -1721,7 +1971,7 @@ var WebRAGPlugin = class {
1721
1971
  return await response.text();
1722
1972
  }
1723
1973
  extractInternalLinks(html, base, stripQueryParams) {
1724
- const $ = cheerio3.load(html);
1974
+ const $ = cheerio4.load(html);
1725
1975
  const links = /* @__PURE__ */ new Set();
1726
1976
  $("a[href]").each((_, el) => {
1727
1977
  const href = ($(el).attr("href") || "").trim();
@@ -1738,6 +1988,22 @@ var WebRAGPlugin = class {
1738
1988
  });
1739
1989
  return Array.from(links);
1740
1990
  }
1991
+ /**
1992
+ * When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
1993
+ * caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
1994
+ * undefined when disabled or on any parse error (link extraction must never fail a crawl).
1995
+ */
1996
+ extractLinksIfEnabled(url, html, config) {
1997
+ if (!config.extractLinks) return void 0;
1998
+ try {
1999
+ const base = new URL(url);
2000
+ const links = this.extractInternalLinks(html, base, config.stripQueryParams ?? false);
2001
+ const cap = config.maxLinksPerPage ?? 200;
2002
+ return links.length > cap ? links.slice(0, cap) : links;
2003
+ } catch {
2004
+ return void 0;
2005
+ }
2006
+ }
1741
2007
  /**
1742
2008
  * Ingest content from a list of URLs
1743
2009
  *
@@ -1768,7 +2034,9 @@ var WebRAGPlugin = class {
1768
2034
  render: config.render,
1769
2035
  renderOptions: config.renderOptions,
1770
2036
  debug: config.debug,
1771
- crawlLedger: config.crawlLedger
2037
+ crawlLedger: config.crawlLedger,
2038
+ extractLinks: config.extractLinks,
2039
+ maxLinksPerPage: config.maxLinksPerPage
1772
2040
  }, options);
1773
2041
  }
1774
2042
  /**
@@ -1887,7 +2155,7 @@ var WebRAGPlugin = class {
1887
2155
  }
1888
2156
  }
1889
2157
  try {
1890
- const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
2158
+ const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2, links } = await this.crawlPageSmart(url, config, timeout, {
1891
2159
  renderMode,
1892
2160
  renderOptions,
1893
2161
  minContentLength,
@@ -1921,7 +2189,8 @@ var WebRAGPlugin = class {
1921
2189
  bodyTextLengthHint: bodyTextLengthHint2,
1922
2190
  title: doc?.metadata?.title,
1923
2191
  docId: doc?.id,
1924
- error: diag?.errorMessage
2192
+ error: diag?.errorMessage,
2193
+ ...links ? { links } : {}
1925
2194
  });
1926
2195
  this.emitCrawlPage(config, {
1927
2196
  url,
@@ -2059,41 +2328,39 @@ var WebRAGPlugin = class {
2059
2328
  const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
2060
2329
  return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
2061
2330
  }
2062
- diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
2331
+ diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed, links) {
2063
2332
  if (blockedSuspected) {
2064
2333
  return {
2065
2334
  doc: null,
2066
- diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
2335
+ diag: { modeUsed: modeFailed, reason: "blocked_suspected" },
2336
+ links
2067
2337
  };
2068
2338
  }
2069
2339
  if (renderFailure) {
2070
2340
  return {
2071
2341
  doc: null,
2072
- diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
2342
+ diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure },
2343
+ links
2073
2344
  };
2074
2345
  }
2075
2346
  return {
2076
2347
  doc,
2077
2348
  diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
2078
- bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
2349
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2,
2350
+ links
2079
2351
  };
2080
2352
  }
2081
2353
  async crawlPageSmart(url, config, timeout, ctx) {
2082
2354
  if (ctx.renderMode === true) {
2083
- const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
2084
- url,
2085
- config,
2086
- timeout,
2087
- ctx.renderOptions,
2088
- ctx.dbg
2089
- );
2355
+ const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected, links } = await this.crawlPageRendered(url, config, timeout, ctx.renderOptions, ctx.dbg);
2090
2356
  return this.diagFromRenderedAttempt(
2091
2357
  doc,
2092
2358
  bodyTextLengthHint2,
2093
2359
  renderFailure,
2094
2360
  blockedSuspected,
2095
2361
  "render_ok",
2096
- "render_failed"
2362
+ "render_failed",
2363
+ links
2097
2364
  );
2098
2365
  }
2099
2366
  try {
@@ -2119,8 +2386,9 @@ var WebRAGPlugin = class {
2119
2386
  const html = await response.text();
2120
2387
  const doc = this.extractDocumentFromHtml(url, html, config);
2121
2388
  const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
2389
+ const staticLinks = this.extractLinksIfEnabled(url, html, config);
2122
2390
  if (doc && doc.content.length >= ctx.minContentLength) {
2123
- return { doc, diag: { modeUsed: "static_ok" } };
2391
+ return { doc, diag: { modeUsed: "static_ok" }, links: staticLinks };
2124
2392
  }
2125
2393
  if (ctx.renderMode === "auto") {
2126
2394
  const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
@@ -2134,7 +2402,8 @@ var WebRAGPlugin = class {
2134
2402
  doc: rendered,
2135
2403
  bodyTextLengthHint: rHint,
2136
2404
  renderFailure,
2137
- blockedSuspected
2405
+ blockedSuspected,
2406
+ links: renderedLinks
2138
2407
  } = await this.crawlPageRendered(
2139
2408
  url,
2140
2409
  config,
@@ -2149,7 +2418,9 @@ var WebRAGPlugin = class {
2149
2418
  renderFailure,
2150
2419
  blockedSuspected,
2151
2420
  "render_fallback_ok",
2152
- "render_fallback_failed"
2421
+ "render_fallback_failed",
2422
+ // Prefer links from the rendered DOM; fall back to the static HTML's links.
2423
+ renderedLinks ?? staticLinks
2153
2424
  );
2154
2425
  if (!rendered && (renderFailure || blockedSuspected)) {
2155
2426
  fb.bodyTextLengthHint = staticHint ?? rHint;
@@ -2160,7 +2431,8 @@ var WebRAGPlugin = class {
2160
2431
  return {
2161
2432
  doc: null,
2162
2433
  diag: { modeUsed: "static_failed", reason: "too_small" },
2163
- bodyTextLengthHint: staticHint
2434
+ bodyTextLengthHint: staticHint,
2435
+ links: staticLinks
2164
2436
  };
2165
2437
  } catch (e) {
2166
2438
  throw e;
@@ -2208,6 +2480,7 @@ var WebRAGPlugin = class {
2208
2480
  const html = await page.content();
2209
2481
  const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
2210
2482
  const doc = this.extractDocumentFromHtml(url, html, config);
2483
+ const links = this.extractLinksIfEnabled(url, html, config);
2211
2484
  if (config.debug?.saveDir && config.debug?.enabled) {
2212
2485
  try {
2213
2486
  const saveDir = config.debug.saveDir;
@@ -2221,7 +2494,7 @@ var WebRAGPlugin = class {
2221
2494
  dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
2222
2495
  }
2223
2496
  }
2224
- return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
2497
+ return { doc, bodyTextLengthHint: bodyTextLengthHint2, links };
2225
2498
  } catch (e) {
2226
2499
  const msg = String(e?.message || e || "render_failed");
2227
2500
  const lower = msg.toLowerCase();
@@ -2587,8 +2860,12 @@ export {
2587
2860
  bodyTextLengthHint,
2588
2861
  extractPageFromHtml,
2589
2862
  extractProductMetadata,
2863
+ hardExcludePage,
2864
+ inferTypeFromUrl,
2590
2865
  normalizeAvailability,
2591
2866
  normalizeCurrency,
2867
+ normalizeDisplayTitle,
2592
2868
  parsePrice,
2869
+ resolvePageCardMetadata,
2593
2870
  urlToDocumentId
2594
2871
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@snap-agent/rag-web",
3
- "version": "0.1.6",
3
+ "version": "0.1.8",
4
4
  "description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",
@@ -68,4 +68,3 @@
68
68
  "url": "https://github.com/vilo-hq/snap-agent/issues"
69
69
  }
70
70
  }
71
-