@snap-agent/rag-web 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
118
118
  interface CrawlLedgerDocument {
119
119
  tenantId: string;
120
120
  agentId: string;
121
+ /** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
122
+ ingestionId?: string;
121
123
  urlNormalized: string;
122
124
  url: string;
123
125
  domain: string;
@@ -369,10 +371,46 @@ interface RSSConfig {
369
371
  /**
370
372
  * Crawl result for sitemap/URL crawling
371
373
  */
374
+ type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
375
+ /** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
376
+ interface CrawlProgressUpdate {
377
+ phase: CrawlProgressPhase;
378
+ /** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
379
+ urlsDiscovered?: number;
380
+ /** URLs that will be crawled in this run (≤ maxPages). */
381
+ urlsScheduled?: number;
382
+ /** During crawl: URLs processed so far (updated after each batch). During indexing: documents fully embedded so far. */
383
+ pagesProcessed?: number;
384
+ /** During indexing: total text chunks to embed (drives web_content writes). */
385
+ chunksTotal?: number;
386
+ /** During indexing: chunks embedded so far. */
387
+ chunksProcessed?: number;
388
+ }
389
+ type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
390
+ type BulkProgressPhase = 'processing' | 'indexing';
391
+ /** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
392
+ interface BulkProgressUpdate {
393
+ phase: BulkProgressPhase;
394
+ opsTotal: number;
395
+ opsDone: number;
396
+ currentOpType?: 'insert' | 'update' | 'delete';
397
+ currentUrl?: string;
398
+ }
399
+ type BulkProgressCallback = (update: BulkProgressUpdate) => void;
400
+ /** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
401
+ type CrawlPageEvent = {
402
+ url: string;
403
+ event: 'start' | 'done';
404
+ status?: string;
405
+ error?: string;
406
+ };
407
+ type CrawlPageCallback = (event: CrawlPageEvent) => void;
372
408
  interface CrawlResult extends WebIngestResult {
373
409
  urlsCrawled: number;
374
410
  urlsSkipped: number;
375
411
  urlsFailed: number;
412
+ /** URLs selected for this crawl batch (≤ maxPages); for progress UI. */
413
+ urlsScheduled?: number;
376
414
  crawledAt: Date;
377
415
  }
378
416
  /**
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
423
461
  private cacheStats;
424
462
  constructor(config: WebRAGConfig);
425
463
  private getCollection;
464
+ private ledgerIndexesEnsured;
426
465
  private getLedgerCollection;
427
466
  /**
428
467
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
430
469
  listCrawlLedger(options?: {
431
470
  agentId?: string;
432
471
  domain?: string;
472
+ ingestionId?: string;
433
473
  status?: CrawlLedgerStatus;
434
474
  limit?: number;
435
475
  skip?: number;
@@ -660,11 +700,19 @@ declare class WebRAGPlugin implements RAGPlugin {
660
700
  private extractBestContentText;
661
701
  private bodyTextLengthHint;
662
702
  private extractDocumentFromHtml;
703
+ /**
704
+ * Fallback image extraction: finds the first meaningful image in the content area.
705
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
706
+ */
707
+ private extractHeroImage;
663
708
  private looksLikeDynamicShell;
664
709
  private diagFromRenderedAttempt;
665
710
  private crawlPageSmart;
666
711
  private crawlPageRendered;
667
712
  private discoverSitemaps;
713
+ private emitBulkProgress;
714
+ private emitCrawlProgress;
715
+ private emitCrawlPage;
668
716
  private createDebugCollector;
669
717
  /**
670
718
  * Clean extracted text content
@@ -735,4 +783,4 @@ declare class WebRAGPlugin implements RAGPlugin {
735
783
  getConfig(): Record<string, any>;
736
784
  }
737
785
 
738
- export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
786
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
package/dist/index.d.ts CHANGED
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
118
118
  interface CrawlLedgerDocument {
119
119
  tenantId: string;
120
120
  agentId: string;
121
+ /** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
122
+ ingestionId?: string;
121
123
  urlNormalized: string;
122
124
  url: string;
123
125
  domain: string;
@@ -369,10 +371,46 @@ interface RSSConfig {
369
371
  /**
370
372
  * Crawl result for sitemap/URL crawling
371
373
  */
374
+ type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
375
+ /** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
376
+ interface CrawlProgressUpdate {
377
+ phase: CrawlProgressPhase;
378
+ /** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
379
+ urlsDiscovered?: number;
380
+ /** URLs that will be crawled in this run (≤ maxPages). */
381
+ urlsScheduled?: number;
382
+ /** During crawl: URLs processed so far (updated after each batch). During indexing: documents fully embedded so far. */
383
+ pagesProcessed?: number;
384
+ /** During indexing: total text chunks to embed (drives web_content writes). */
385
+ chunksTotal?: number;
386
+ /** During indexing: chunks embedded so far. */
387
+ chunksProcessed?: number;
388
+ }
389
+ type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
390
+ type BulkProgressPhase = 'processing' | 'indexing';
391
+ /** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
392
+ interface BulkProgressUpdate {
393
+ phase: BulkProgressPhase;
394
+ opsTotal: number;
395
+ opsDone: number;
396
+ currentOpType?: 'insert' | 'update' | 'delete';
397
+ currentUrl?: string;
398
+ }
399
+ type BulkProgressCallback = (update: BulkProgressUpdate) => void;
400
+ /** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
401
+ type CrawlPageEvent = {
402
+ url: string;
403
+ event: 'start' | 'done';
404
+ status?: string;
405
+ error?: string;
406
+ };
407
+ type CrawlPageCallback = (event: CrawlPageEvent) => void;
372
408
  interface CrawlResult extends WebIngestResult {
373
409
  urlsCrawled: number;
374
410
  urlsSkipped: number;
375
411
  urlsFailed: number;
412
+ /** URLs selected for this crawl batch (≤ maxPages); for progress UI. */
413
+ urlsScheduled?: number;
376
414
  crawledAt: Date;
377
415
  }
378
416
  /**
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
423
461
  private cacheStats;
424
462
  constructor(config: WebRAGConfig);
425
463
  private getCollection;
464
+ private ledgerIndexesEnsured;
426
465
  private getLedgerCollection;
427
466
  /**
428
467
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
430
469
  listCrawlLedger(options?: {
431
470
  agentId?: string;
432
471
  domain?: string;
472
+ ingestionId?: string;
433
473
  status?: CrawlLedgerStatus;
434
474
  limit?: number;
435
475
  skip?: number;
@@ -660,11 +700,19 @@ declare class WebRAGPlugin implements RAGPlugin {
660
700
  private extractBestContentText;
661
701
  private bodyTextLengthHint;
662
702
  private extractDocumentFromHtml;
703
+ /**
704
+ * Fallback image extraction: finds the first meaningful image in the content area.
705
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
706
+ */
707
+ private extractHeroImage;
663
708
  private looksLikeDynamicShell;
664
709
  private diagFromRenderedAttempt;
665
710
  private crawlPageSmart;
666
711
  private crawlPageRendered;
667
712
  private discoverSitemaps;
713
+ private emitBulkProgress;
714
+ private emitCrawlProgress;
715
+ private emitCrawlPage;
668
716
  private createDebugCollector;
669
717
  /**
670
718
  * Clean extracted text content
@@ -735,4 +783,4 @@ declare class WebRAGPlugin implements RAGPlugin {
735
783
  getConfig(): Record<string, any>;
736
784
  }
737
785
 
738
- export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
786
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
package/dist/index.js CHANGED
@@ -40,6 +40,24 @@ var import_openai = __toESM(require("openai"));
40
40
  var cheerio = __toESM(require("cheerio"));
41
41
  var fs = __toESM(require("fs"));
42
42
  var path = __toESM(require("path"));
43
+ function bulkOpCurrentUrl(op) {
44
+ const meta = op.document?.metadata;
45
+ if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
46
+ if (typeof meta?.source === "string" && meta.source.trim()) return meta.source.trim();
47
+ return void 0;
48
+ }
49
+ function isUrlListingInsert(document) {
50
+ const meta = document.metadata;
51
+ if (meta?.type !== "url") return false;
52
+ const url = typeof meta.url === "string" ? meta.url.trim() : "";
53
+ if (!url) return false;
54
+ try {
55
+ const parsed = new URL(url);
56
+ return parsed.protocol === "http:" || parsed.protocol === "https:";
57
+ } catch {
58
+ return false;
59
+ }
60
+ }
43
61
  var WebRAGPlugin = class _WebRAGPlugin {
44
62
  name = "web-rag";
45
63
  type = "rag";
@@ -78,6 +96,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
78
96
  }
79
97
  return this.db.collection(this.config.collection);
80
98
  }
99
+ ledgerIndexesEnsured = false;
81
100
  async getLedgerCollection() {
82
101
  if (!this.client) {
83
102
  this.client = new import_mongodb.MongoClient(this.config.mongoUri);
@@ -85,7 +104,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
85
104
  this.db = this.client.db(this.config.dbName);
86
105
  }
87
106
  const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
88
- return this.db.collection(name);
107
+ const col = this.db.collection(name);
108
+ if (!this.ledgerIndexesEnsured) {
109
+ this.ledgerIndexesEnsured = true;
110
+ await col.createIndex(
111
+ { tenantId: 1, agentId: 1, urlNormalized: 1 },
112
+ { unique: true }
113
+ );
114
+ await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
115
+ }
116
+ return col;
89
117
  }
90
118
  /**
91
119
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -95,6 +123,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
95
123
  const filter = { tenantId: this.config.tenantId };
96
124
  filter.agentId = options.agentId ?? "shared";
97
125
  if (options.domain) filter.domain = options.domain;
126
+ if (options.ingestionId) filter.ingestionId = options.ingestionId;
98
127
  if (options.status) filter.lastStatus = options.status;
99
128
  const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
100
129
  const skip = Math.max(options.skip ?? 0, 0);
@@ -163,6 +192,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
163
192
  lastCrawledAt: now,
164
193
  updatedAt: now
165
194
  };
195
+ if (params.ingestionId) {
196
+ $set.ingestionId = params.ingestionId;
197
+ }
166
198
  if (errMsg !== void 0) {
167
199
  $set.errorMessage = errMsg;
168
200
  } else if (params.status === "indexed" && params.doc) {
@@ -175,9 +207,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
175
207
  $set.docId = params.doc.id;
176
208
  } else {
177
209
  $set.modeUsed = params.diag?.modeUsed;
178
- $set.contentLength = null;
179
- $set.title = null;
180
- $set.docId = null;
210
+ $set.contentLength = params.contentLength ?? null;
211
+ $set.title = params.title ?? null;
212
+ $set.docId = params.docId ?? null;
181
213
  }
182
214
  await col.updateOne(
183
215
  {
@@ -254,6 +286,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
254
286
  type: doc.metadata.type,
255
287
  title: doc.metadata.title,
256
288
  url: doc.metadata.url,
289
+ imageUrl: doc.metadata.imageUrl,
290
+ description: doc.metadata.description,
257
291
  score: doc.score
258
292
  }))
259
293
  }
@@ -419,9 +453,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
419
453
  let indexed = 0;
420
454
  const errors = [];
421
455
  const agentId = options?.agentId || "shared";
422
- for (const doc of documents) {
456
+ const onCrawlProgress = options?.metadata?.onCrawlProgress;
457
+ const indexingTotal = documents.length;
458
+ const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
459
+ const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
460
+ let chunksProcessed = 0;
461
+ if (onCrawlProgress && indexingTotal > 0) {
462
+ this.emitCrawlProgress(
463
+ { metadata: options?.metadata },
464
+ {
465
+ phase: "indexing",
466
+ urlsScheduled: indexingTotal,
467
+ pagesProcessed: 0,
468
+ chunksTotal,
469
+ chunksProcessed: 0
470
+ }
471
+ );
472
+ }
473
+ for (let docIndex = 0; docIndex < documents.length; docIndex++) {
474
+ const doc = documents[docIndex];
475
+ const chunks = chunkPlan[docIndex];
423
476
  try {
424
- const chunks = this.chunkContent(doc.content);
425
477
  const isChunked = chunks.length > 1;
426
478
  if (isChunked) {
427
479
  await collection.deleteMany({
@@ -456,6 +508,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
456
508
  },
457
509
  { upsert: true }
458
510
  );
511
+ chunksProcessed++;
512
+ if (onCrawlProgress) {
513
+ this.emitCrawlProgress(
514
+ { metadata: options?.metadata },
515
+ {
516
+ phase: "indexing",
517
+ urlsScheduled: indexingTotal,
518
+ pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
519
+ chunksTotal,
520
+ chunksProcessed
521
+ }
522
+ );
523
+ }
459
524
  }
460
525
  indexed++;
461
526
  } catch (error) {
@@ -535,23 +600,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
535
600
  let deleted = 0;
536
601
  let failed = 0;
537
602
  const errors = [];
603
+ const opsTotal = operations.length;
604
+ let opsDone = 0;
605
+ const ingestOptions = options ?? {};
606
+ this.emitBulkProgress(ingestOptions, {
607
+ phase: "processing",
608
+ opsTotal,
609
+ opsDone: 0
610
+ });
538
611
  for (const op of operations) {
612
+ const currentUrl = bulkOpCurrentUrl(op);
539
613
  try {
540
614
  switch (op.type) {
541
615
  case "insert":
542
616
  if (op.document) {
543
- await this.ingest([op.document], options);
544
- inserted++;
617
+ if (isUrlListingInsert(op.document)) {
618
+ const url = bulkOpCurrentUrl(op);
619
+ const crawlResult = await this.ingestSinglePageFromUrl(
620
+ {
621
+ url,
622
+ metadata: {
623
+ ...op.document.metadata ?? {},
624
+ url
625
+ }
626
+ },
627
+ ingestOptions
628
+ );
629
+ if (crawlResult.indexed > 0) {
630
+ inserted++;
631
+ } else {
632
+ failed++;
633
+ const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
634
+ errors.push({
635
+ id: op.id,
636
+ operation: op.type,
637
+ error: err
638
+ });
639
+ }
640
+ } else {
641
+ await this.ingest([op.document], ingestOptions);
642
+ inserted++;
643
+ }
545
644
  }
546
645
  break;
547
646
  case "update":
548
647
  if (op.document) {
549
- await this.update(op.id, op.document, options);
648
+ await this.update(op.id, op.document, ingestOptions);
550
649
  updated++;
551
650
  }
552
651
  break;
553
652
  case "delete":
554
- const count = await this.delete(op.id, options);
653
+ const count = await this.delete(op.id, ingestOptions);
555
654
  deleted += count;
556
655
  break;
557
656
  }
@@ -562,6 +661,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
562
661
  operation: op.type,
563
662
  error: error.message || "Unknown error"
564
663
  });
664
+ } finally {
665
+ opsDone++;
666
+ this.emitBulkProgress(ingestOptions, {
667
+ phase: "processing",
668
+ opsTotal,
669
+ opsDone,
670
+ currentOpType: op.type,
671
+ ...currentUrl ? { currentUrl } : {}
672
+ });
565
673
  }
566
674
  }
567
675
  return {
@@ -1128,6 +1236,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1128
1236
  };
1129
1237
  }
1130
1238
  const dbg = this.createDebugCollector(config.debug);
1239
+ this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
1131
1240
  const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
1132
1241
  if (!base) {
1133
1242
  return {
@@ -1159,6 +1268,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
1159
1268
  if (config.excludePatterns?.length) {
1160
1269
  filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
1161
1270
  }
1271
+ this.emitCrawlProgress(config, {
1272
+ phase: "discovering",
1273
+ urlsDiscovered: filteredUrls.length
1274
+ });
1162
1275
  urlsToCrawl = filteredUrls.slice(0, maxPages);
1163
1276
  urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
1164
1277
  break;
@@ -1180,7 +1293,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
1180
1293
  urlsToCrawl = discovery.urls;
1181
1294
  urlsSkipped = discovery.skipped;
1182
1295
  dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
1296
+ this.emitCrawlProgress(config, {
1297
+ phase: "discovering",
1298
+ urlsDiscovered: urlsToCrawl.length
1299
+ });
1183
1300
  }
1301
+ this.emitCrawlProgress(config, {
1302
+ phase: "crawling",
1303
+ urlsDiscovered: urlsToCrawl.length,
1304
+ urlsScheduled: urlsToCrawl.length
1305
+ });
1184
1306
  const result = await this.crawlUrls(urlsToCrawl, {
1185
1307
  contentSelector: config.contentSelector,
1186
1308
  titleSelector: config.titleSelector,
@@ -1202,9 +1324,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1202
1324
  return {
1203
1325
  ...result,
1204
1326
  urlsSkipped,
1327
+ /** URLs selected for this crawl (≤ maxPages); use as the progress UI denominator. */
1328
+ urlsScheduled: urlsToCrawl.length,
1205
1329
  crawledAt: /* @__PURE__ */ new Date(),
1206
1330
  metadata: {
1207
1331
  ...result.metadata || {},
1332
+ urlsScheduled: urlsToCrawl.length,
1208
1333
  discoveryDebug: dbg.summary()
1209
1334
  }
1210
1335
  };
@@ -1432,6 +1557,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1432
1557
  const forceRecrawl = !!(options && options.forceRecrawl);
1433
1558
  const agentId = options?.agentId ?? "shared";
1434
1559
  const stripQ = config.stripQueryParams ?? false;
1560
+ const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
1435
1561
  const urlByNorm = /* @__PURE__ */ new Map();
1436
1562
  for (const u of urls) {
1437
1563
  const norm = this.normalizeLedgerUrl(u, stripQ) || u;
@@ -1460,6 +1586,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1460
1586
  const results = await Promise.allSettled(
1461
1587
  batch.map(async (url) => {
1462
1588
  const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
1589
+ this.emitCrawlPage(config, { url, event: "start" });
1463
1590
  if (ledgerOpts && !forceRecrawl) {
1464
1591
  const entry = await this.findLedgerEntry(urlNormalized, agentId);
1465
1592
  if (this.shouldSkipLedger(
@@ -1480,6 +1607,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
1480
1607
  docId: entry?.docId
1481
1608
  });
1482
1609
  dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
1610
+ if (ledgerOpts) {
1611
+ await this.upsertLedgerRecord({
1612
+ url,
1613
+ urlNormalized,
1614
+ agentId,
1615
+ ingestionId,
1616
+ status: "skipped_ledger",
1617
+ title: entry?.title,
1618
+ docId: entry?.docId,
1619
+ contentLength: entry?.contentLength
1620
+ });
1621
+ }
1622
+ this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
1483
1623
  return { kind: "ledger_skip", url };
1484
1624
  }
1485
1625
  }
@@ -1503,6 +1643,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1503
1643
  url,
1504
1644
  urlNormalized,
1505
1645
  agentId,
1646
+ ingestionId,
1506
1647
  status: crawlSt,
1507
1648
  doc,
1508
1649
  diag
@@ -1519,6 +1660,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1519
1660
  docId: doc?.id,
1520
1661
  error: diag?.errorMessage
1521
1662
  });
1663
+ this.emitCrawlPage(config, {
1664
+ url,
1665
+ event: "done",
1666
+ status: crawlSt,
1667
+ error: diag?.errorMessage
1668
+ });
1522
1669
  return { kind: "doc", doc, url };
1523
1670
  } catch (error) {
1524
1671
  const msg = error instanceof Error ? error.message : String(error);
@@ -1527,6 +1674,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1527
1674
  url,
1528
1675
  urlNormalized,
1529
1676
  agentId,
1677
+ ingestionId,
1530
1678
  status: "error",
1531
1679
  errorMessage: msg
1532
1680
  });
@@ -1537,6 +1685,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1537
1685
  status: "error",
1538
1686
  error: msg
1539
1687
  });
1688
+ this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
1540
1689
  throw { url, error };
1541
1690
  }
1542
1691
  })
@@ -1559,12 +1708,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
1559
1708
  });
1560
1709
  }
1561
1710
  }
1711
+ this.emitCrawlProgress(config, {
1712
+ phase: "crawling",
1713
+ urlsScheduled: uniqueUrls.length,
1714
+ pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
1715
+ });
1562
1716
  if (i + concurrency < uniqueUrls.length) {
1563
1717
  await this.delay(delayMs);
1564
1718
  }
1565
1719
  }
1566
1720
  if (documents.length > 0) {
1567
- const ingestResult = await this.ingest(documents, options);
1721
+ const ingestResult = await this.ingest(documents, {
1722
+ ...options,
1723
+ metadata: {
1724
+ ...options?.metadata ?? {},
1725
+ onCrawlProgress: config.metadata?.onCrawlProgress
1726
+ }
1727
+ });
1568
1728
  indexed = ingestResult.indexed;
1569
1729
  if (ingestResult.errors) {
1570
1730
  errors.push(...ingestResult.errors);
@@ -1659,7 +1819,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
1659
1819
  const content = this.extractBestContentText($, config);
1660
1820
  const minChars = config.minExtractedContentLength ?? 50;
1661
1821
  if (!content || content.length < minChars) return null;
1662
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || void 0;
1822
+ const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1823
+ this.extractHeroImage($, url) || void 0;
1663
1824
  let imageUrl;
1664
1825
  if (image) {
1665
1826
  try {
@@ -1692,6 +1853,39 @@ var WebRAGPlugin = class _WebRAGPlugin {
1692
1853
  }
1693
1854
  };
1694
1855
  }
1856
+ /**
1857
+ * Fallback image extraction: finds the first meaningful image in the content area.
1858
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
1859
+ */
1860
+ extractHeroImage($, pageUrl) {
1861
+ const containers = $('main, article, [role="main"], #content, .content');
1862
+ const scope = containers.length > 0 ? containers : $("body");
1863
+ let best;
1864
+ scope.find("img[src]").each((_, el) => {
1865
+ if (best) return false;
1866
+ const src = $(el).attr("src") || "";
1867
+ const alt = ($(el).attr("alt") || "").toLowerCase();
1868
+ const width = parseInt($(el).attr("width") || "0", 10);
1869
+ const height = parseInt($(el).attr("height") || "0", 10);
1870
+ if (width > 0 && width < 80 || height > 0 && height < 80) return;
1871
+ if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1872
+ if (src.startsWith("data:") || src.endsWith(".svg")) return;
1873
+ if (src.includes("/_next/image")) {
1874
+ try {
1875
+ const nextUrl = new URL(src, pageUrl);
1876
+ const realUrl = nextUrl.searchParams.get("url");
1877
+ if (realUrl) {
1878
+ best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1879
+ return false;
1880
+ }
1881
+ } catch {
1882
+ }
1883
+ }
1884
+ best = src;
1885
+ return false;
1886
+ });
1887
+ return best;
1888
+ }
1695
1889
  looksLikeDynamicShell(html) {
1696
1890
  const lower = html.toLowerCase();
1697
1891
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
@@ -1921,6 +2115,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
1921
2115
  }
1922
2116
  return Array.from(found);
1923
2117
  }
2118
+ emitBulkProgress(options, update) {
2119
+ const fn = options?.metadata?.onBulkProgress;
2120
+ if (!fn) return;
2121
+ try {
2122
+ fn(update);
2123
+ } catch {
2124
+ }
2125
+ }
2126
+ emitCrawlProgress(config, update) {
2127
+ const fn = config.metadata?.onCrawlProgress;
2128
+ if (!fn) return;
2129
+ try {
2130
+ fn(update);
2131
+ } catch {
2132
+ }
2133
+ }
2134
+ emitCrawlPage(config, event) {
2135
+ const fn = config.metadata?.onCrawlPage;
2136
+ if (!fn) return;
2137
+ try {
2138
+ fn(event);
2139
+ } catch {
2140
+ }
2141
+ }
1924
2142
  createDebugCollector(debug) {
1925
2143
  const enabled = !!debug?.enabled;
1926
2144
  const level = debug?.level || "summary";
@@ -2209,6 +2427,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2209
2427
  filterableFields: this.config.filterableFields,
2210
2428
  typeBoosts: this.config.typeBoosts,
2211
2429
  recencyBoost: this.config.recencyBoost,
2430
+ crawlLedger: this.config.crawlLedger,
2212
2431
  priority: this.priority
2213
2432
  };
2214
2433
  }
package/dist/index.mjs CHANGED
@@ -4,6 +4,24 @@ import OpenAI from "openai";
4
4
  import * as cheerio from "cheerio";
5
5
  import * as fs from "fs";
6
6
  import * as path from "path";
7
+ function bulkOpCurrentUrl(op) {
8
+ const meta = op.document?.metadata;
9
+ if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
10
+ if (typeof meta?.source === "string" && meta.source.trim()) return meta.source.trim();
11
+ return void 0;
12
+ }
13
+ function isUrlListingInsert(document) {
14
+ const meta = document.metadata;
15
+ if (meta?.type !== "url") return false;
16
+ const url = typeof meta.url === "string" ? meta.url.trim() : "";
17
+ if (!url) return false;
18
+ try {
19
+ const parsed = new URL(url);
20
+ return parsed.protocol === "http:" || parsed.protocol === "https:";
21
+ } catch {
22
+ return false;
23
+ }
24
+ }
7
25
  var WebRAGPlugin = class _WebRAGPlugin {
8
26
  name = "web-rag";
9
27
  type = "rag";
@@ -42,6 +60,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
42
60
  }
43
61
  return this.db.collection(this.config.collection);
44
62
  }
63
+ ledgerIndexesEnsured = false;
45
64
  async getLedgerCollection() {
46
65
  if (!this.client) {
47
66
  this.client = new MongoClient(this.config.mongoUri);
@@ -49,7 +68,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
49
68
  this.db = this.client.db(this.config.dbName);
50
69
  }
51
70
  const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
52
- return this.db.collection(name);
71
+ const col = this.db.collection(name);
72
+ if (!this.ledgerIndexesEnsured) {
73
+ this.ledgerIndexesEnsured = true;
74
+ await col.createIndex(
75
+ { tenantId: 1, agentId: 1, urlNormalized: 1 },
76
+ { unique: true }
77
+ );
78
+ await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
79
+ }
80
+ return col;
53
81
  }
54
82
  /**
55
83
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -59,6 +87,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
59
87
  const filter = { tenantId: this.config.tenantId };
60
88
  filter.agentId = options.agentId ?? "shared";
61
89
  if (options.domain) filter.domain = options.domain;
90
+ if (options.ingestionId) filter.ingestionId = options.ingestionId;
62
91
  if (options.status) filter.lastStatus = options.status;
63
92
  const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
64
93
  const skip = Math.max(options.skip ?? 0, 0);
@@ -127,6 +156,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
127
156
  lastCrawledAt: now,
128
157
  updatedAt: now
129
158
  };
159
+ if (params.ingestionId) {
160
+ $set.ingestionId = params.ingestionId;
161
+ }
130
162
  if (errMsg !== void 0) {
131
163
  $set.errorMessage = errMsg;
132
164
  } else if (params.status === "indexed" && params.doc) {
@@ -139,9 +171,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
139
171
  $set.docId = params.doc.id;
140
172
  } else {
141
173
  $set.modeUsed = params.diag?.modeUsed;
142
- $set.contentLength = null;
143
- $set.title = null;
144
- $set.docId = null;
174
+ $set.contentLength = params.contentLength ?? null;
175
+ $set.title = params.title ?? null;
176
+ $set.docId = params.docId ?? null;
145
177
  }
146
178
  await col.updateOne(
147
179
  {
@@ -218,6 +250,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
218
250
  type: doc.metadata.type,
219
251
  title: doc.metadata.title,
220
252
  url: doc.metadata.url,
253
+ imageUrl: doc.metadata.imageUrl,
254
+ description: doc.metadata.description,
221
255
  score: doc.score
222
256
  }))
223
257
  }
@@ -383,9 +417,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
383
417
  let indexed = 0;
384
418
  const errors = [];
385
419
  const agentId = options?.agentId || "shared";
386
- for (const doc of documents) {
420
+ const onCrawlProgress = options?.metadata?.onCrawlProgress;
421
+ const indexingTotal = documents.length;
422
+ const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
423
+ const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
424
+ let chunksProcessed = 0;
425
+ if (onCrawlProgress && indexingTotal > 0) {
426
+ this.emitCrawlProgress(
427
+ { metadata: options?.metadata },
428
+ {
429
+ phase: "indexing",
430
+ urlsScheduled: indexingTotal,
431
+ pagesProcessed: 0,
432
+ chunksTotal,
433
+ chunksProcessed: 0
434
+ }
435
+ );
436
+ }
437
+ for (let docIndex = 0; docIndex < documents.length; docIndex++) {
438
+ const doc = documents[docIndex];
439
+ const chunks = chunkPlan[docIndex];
387
440
  try {
388
- const chunks = this.chunkContent(doc.content);
389
441
  const isChunked = chunks.length > 1;
390
442
  if (isChunked) {
391
443
  await collection.deleteMany({
@@ -420,6 +472,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
420
472
  },
421
473
  { upsert: true }
422
474
  );
475
+ chunksProcessed++;
476
+ if (onCrawlProgress) {
477
+ this.emitCrawlProgress(
478
+ { metadata: options?.metadata },
479
+ {
480
+ phase: "indexing",
481
+ urlsScheduled: indexingTotal,
482
+ pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
483
+ chunksTotal,
484
+ chunksProcessed
485
+ }
486
+ );
487
+ }
423
488
  }
424
489
  indexed++;
425
490
  } catch (error) {
@@ -499,23 +564,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
499
564
  let deleted = 0;
500
565
  let failed = 0;
501
566
  const errors = [];
567
+ const opsTotal = operations.length;
568
+ let opsDone = 0;
569
+ const ingestOptions = options ?? {};
570
+ this.emitBulkProgress(ingestOptions, {
571
+ phase: "processing",
572
+ opsTotal,
573
+ opsDone: 0
574
+ });
502
575
  for (const op of operations) {
576
+ const currentUrl = bulkOpCurrentUrl(op);
503
577
  try {
504
578
  switch (op.type) {
505
579
  case "insert":
506
580
  if (op.document) {
507
- await this.ingest([op.document], options);
508
- inserted++;
581
+ if (isUrlListingInsert(op.document)) {
582
+ const url = bulkOpCurrentUrl(op);
583
+ const crawlResult = await this.ingestSinglePageFromUrl(
584
+ {
585
+ url,
586
+ metadata: {
587
+ ...op.document.metadata ?? {},
588
+ url
589
+ }
590
+ },
591
+ ingestOptions
592
+ );
593
+ if (crawlResult.indexed > 0) {
594
+ inserted++;
595
+ } else {
596
+ failed++;
597
+ const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
598
+ errors.push({
599
+ id: op.id,
600
+ operation: op.type,
601
+ error: err
602
+ });
603
+ }
604
+ } else {
605
+ await this.ingest([op.document], ingestOptions);
606
+ inserted++;
607
+ }
509
608
  }
510
609
  break;
511
610
  case "update":
512
611
  if (op.document) {
513
- await this.update(op.id, op.document, options);
612
+ await this.update(op.id, op.document, ingestOptions);
514
613
  updated++;
515
614
  }
516
615
  break;
517
616
  case "delete":
518
- const count = await this.delete(op.id, options);
617
+ const count = await this.delete(op.id, ingestOptions);
519
618
  deleted += count;
520
619
  break;
521
620
  }
@@ -526,6 +625,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
526
625
  operation: op.type,
527
626
  error: error.message || "Unknown error"
528
627
  });
628
+ } finally {
629
+ opsDone++;
630
+ this.emitBulkProgress(ingestOptions, {
631
+ phase: "processing",
632
+ opsTotal,
633
+ opsDone,
634
+ currentOpType: op.type,
635
+ ...currentUrl ? { currentUrl } : {}
636
+ });
529
637
  }
530
638
  }
531
639
  return {
@@ -1092,6 +1200,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1092
1200
  };
1093
1201
  }
1094
1202
  const dbg = this.createDebugCollector(config.debug);
1203
+ this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
1095
1204
  const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
1096
1205
  if (!base) {
1097
1206
  return {
@@ -1123,6 +1232,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
1123
1232
  if (config.excludePatterns?.length) {
1124
1233
  filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
1125
1234
  }
1235
+ this.emitCrawlProgress(config, {
1236
+ phase: "discovering",
1237
+ urlsDiscovered: filteredUrls.length
1238
+ });
1126
1239
  urlsToCrawl = filteredUrls.slice(0, maxPages);
1127
1240
  urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
1128
1241
  break;
@@ -1144,7 +1257,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
1144
1257
  urlsToCrawl = discovery.urls;
1145
1258
  urlsSkipped = discovery.skipped;
1146
1259
  dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
1260
+ this.emitCrawlProgress(config, {
1261
+ phase: "discovering",
1262
+ urlsDiscovered: urlsToCrawl.length
1263
+ });
1147
1264
  }
1265
+ this.emitCrawlProgress(config, {
1266
+ phase: "crawling",
1267
+ urlsDiscovered: urlsToCrawl.length,
1268
+ urlsScheduled: urlsToCrawl.length
1269
+ });
1148
1270
  const result = await this.crawlUrls(urlsToCrawl, {
1149
1271
  contentSelector: config.contentSelector,
1150
1272
  titleSelector: config.titleSelector,
@@ -1166,9 +1288,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1166
1288
  return {
1167
1289
  ...result,
1168
1290
  urlsSkipped,
1291
+ /** URLs selected for this crawl (≤ maxPages); use as the progress UI denominator. */
1292
+ urlsScheduled: urlsToCrawl.length,
1169
1293
  crawledAt: /* @__PURE__ */ new Date(),
1170
1294
  metadata: {
1171
1295
  ...result.metadata || {},
1296
+ urlsScheduled: urlsToCrawl.length,
1172
1297
  discoveryDebug: dbg.summary()
1173
1298
  }
1174
1299
  };
@@ -1396,6 +1521,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1396
1521
  const forceRecrawl = !!(options && options.forceRecrawl);
1397
1522
  const agentId = options?.agentId ?? "shared";
1398
1523
  const stripQ = config.stripQueryParams ?? false;
1524
+ const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
1399
1525
  const urlByNorm = /* @__PURE__ */ new Map();
1400
1526
  for (const u of urls) {
1401
1527
  const norm = this.normalizeLedgerUrl(u, stripQ) || u;
@@ -1424,6 +1550,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1424
1550
  const results = await Promise.allSettled(
1425
1551
  batch.map(async (url) => {
1426
1552
  const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
1553
+ this.emitCrawlPage(config, { url, event: "start" });
1427
1554
  if (ledgerOpts && !forceRecrawl) {
1428
1555
  const entry = await this.findLedgerEntry(urlNormalized, agentId);
1429
1556
  if (this.shouldSkipLedger(
@@ -1444,6 +1571,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
1444
1571
  docId: entry?.docId
1445
1572
  });
1446
1573
  dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
1574
+ if (ledgerOpts) {
1575
+ await this.upsertLedgerRecord({
1576
+ url,
1577
+ urlNormalized,
1578
+ agentId,
1579
+ ingestionId,
1580
+ status: "skipped_ledger",
1581
+ title: entry?.title,
1582
+ docId: entry?.docId,
1583
+ contentLength: entry?.contentLength
1584
+ });
1585
+ }
1586
+ this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
1447
1587
  return { kind: "ledger_skip", url };
1448
1588
  }
1449
1589
  }
@@ -1467,6 +1607,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1467
1607
  url,
1468
1608
  urlNormalized,
1469
1609
  agentId,
1610
+ ingestionId,
1470
1611
  status: crawlSt,
1471
1612
  doc,
1472
1613
  diag
@@ -1483,6 +1624,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1483
1624
  docId: doc?.id,
1484
1625
  error: diag?.errorMessage
1485
1626
  });
1627
+ this.emitCrawlPage(config, {
1628
+ url,
1629
+ event: "done",
1630
+ status: crawlSt,
1631
+ error: diag?.errorMessage
1632
+ });
1486
1633
  return { kind: "doc", doc, url };
1487
1634
  } catch (error) {
1488
1635
  const msg = error instanceof Error ? error.message : String(error);
@@ -1491,6 +1638,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1491
1638
  url,
1492
1639
  urlNormalized,
1493
1640
  agentId,
1641
+ ingestionId,
1494
1642
  status: "error",
1495
1643
  errorMessage: msg
1496
1644
  });
@@ -1501,6 +1649,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1501
1649
  status: "error",
1502
1650
  error: msg
1503
1651
  });
1652
+ this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
1504
1653
  throw { url, error };
1505
1654
  }
1506
1655
  })
@@ -1523,12 +1672,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
1523
1672
  });
1524
1673
  }
1525
1674
  }
1675
+ this.emitCrawlProgress(config, {
1676
+ phase: "crawling",
1677
+ urlsScheduled: uniqueUrls.length,
1678
+ pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
1679
+ });
1526
1680
  if (i + concurrency < uniqueUrls.length) {
1527
1681
  await this.delay(delayMs);
1528
1682
  }
1529
1683
  }
1530
1684
  if (documents.length > 0) {
1531
- const ingestResult = await this.ingest(documents, options);
1685
+ const ingestResult = await this.ingest(documents, {
1686
+ ...options,
1687
+ metadata: {
1688
+ ...options?.metadata ?? {},
1689
+ onCrawlProgress: config.metadata?.onCrawlProgress
1690
+ }
1691
+ });
1532
1692
  indexed = ingestResult.indexed;
1533
1693
  if (ingestResult.errors) {
1534
1694
  errors.push(...ingestResult.errors);
@@ -1623,7 +1783,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
1623
1783
  const content = this.extractBestContentText($, config);
1624
1784
  const minChars = config.minExtractedContentLength ?? 50;
1625
1785
  if (!content || content.length < minChars) return null;
1626
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || void 0;
1786
+ const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1787
+ this.extractHeroImage($, url) || void 0;
1627
1788
  let imageUrl;
1628
1789
  if (image) {
1629
1790
  try {
@@ -1656,6 +1817,39 @@ var WebRAGPlugin = class _WebRAGPlugin {
1656
1817
  }
1657
1818
  };
1658
1819
  }
1820
+ /**
1821
+ * Fallback image extraction: finds the first meaningful image in the content area.
1822
+ * Skips icons, avatars, and tiny assets by filtering on common patterns.
1823
+ */
1824
+ extractHeroImage($, pageUrl) {
1825
+ const containers = $('main, article, [role="main"], #content, .content');
1826
+ const scope = containers.length > 0 ? containers : $("body");
1827
+ let best;
1828
+ scope.find("img[src]").each((_, el) => {
1829
+ if (best) return false;
1830
+ const src = $(el).attr("src") || "";
1831
+ const alt = ($(el).attr("alt") || "").toLowerCase();
1832
+ const width = parseInt($(el).attr("width") || "0", 10);
1833
+ const height = parseInt($(el).attr("height") || "0", 10);
1834
+ if (width > 0 && width < 80 || height > 0 && height < 80) return;
1835
+ if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1836
+ if (src.startsWith("data:") || src.endsWith(".svg")) return;
1837
+ if (src.includes("/_next/image")) {
1838
+ try {
1839
+ const nextUrl = new URL(src, pageUrl);
1840
+ const realUrl = nextUrl.searchParams.get("url");
1841
+ if (realUrl) {
1842
+ best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1843
+ return false;
1844
+ }
1845
+ } catch {
1846
+ }
1847
+ }
1848
+ best = src;
1849
+ return false;
1850
+ });
1851
+ return best;
1852
+ }
1659
1853
  looksLikeDynamicShell(html) {
1660
1854
  const lower = html.toLowerCase();
1661
1855
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
@@ -1885,6 +2079,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
1885
2079
  }
1886
2080
  return Array.from(found);
1887
2081
  }
2082
+ emitBulkProgress(options, update) {
2083
+ const fn = options?.metadata?.onBulkProgress;
2084
+ if (!fn) return;
2085
+ try {
2086
+ fn(update);
2087
+ } catch {
2088
+ }
2089
+ }
2090
+ emitCrawlProgress(config, update) {
2091
+ const fn = config.metadata?.onCrawlProgress;
2092
+ if (!fn) return;
2093
+ try {
2094
+ fn(update);
2095
+ } catch {
2096
+ }
2097
+ }
2098
+ emitCrawlPage(config, event) {
2099
+ const fn = config.metadata?.onCrawlPage;
2100
+ if (!fn) return;
2101
+ try {
2102
+ fn(event);
2103
+ } catch {
2104
+ }
2105
+ }
1888
2106
  createDebugCollector(debug) {
1889
2107
  const enabled = !!debug?.enabled;
1890
2108
  const level = debug?.level || "summary";
@@ -2173,6 +2391,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2173
2391
  filterableFields: this.config.filterableFields,
2174
2392
  typeBoosts: this.config.typeBoosts,
2175
2393
  recencyBoost: this.config.recencyBoost,
2394
+ crawlLedger: this.config.crawlLedger,
2176
2395
  priority: this.priority
2177
2396
  };
2178
2397
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@snap-agent/rag-web",
3
- "version": "0.1.3",
3
+ "version": "0.1.5",
4
4
  "description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",