@snap-agent/rag-web 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
118
118
  interface CrawlLedgerDocument {
119
119
  tenantId: string;
120
120
  agentId: string;
121
+ /** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
122
+ ingestionId?: string;
121
123
  urlNormalized: string;
122
124
  url: string;
123
125
  domain: string;
@@ -369,10 +371,46 @@ interface RSSConfig {
369
371
  /**
370
372
  * Crawl result for sitemap/URL crawling
371
373
  */
374
+ type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
375
+ /** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
376
+ interface CrawlProgressUpdate {
377
+ phase: CrawlProgressPhase;
378
+ /** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
379
+ urlsDiscovered?: number;
380
+ /** URLs that will be crawled in this run (≤ maxPages). */
381
+ urlsScheduled?: number;
382
+ /** During crawl: batches done. During indexing: documents fully embedded so far. */
383
+ pagesProcessed?: number;
384
+ /** During indexing: total text chunks to embed (drives web_content writes). */
385
+ chunksTotal?: number;
386
+ /** During indexing: chunks embedded so far. */
387
+ chunksProcessed?: number;
388
+ }
389
+ type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
390
+ type BulkProgressPhase = 'processing' | 'indexing';
391
+ /** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
392
+ interface BulkProgressUpdate {
393
+ phase: BulkProgressPhase;
394
+ opsTotal: number;
395
+ opsDone: number;
396
+ currentOpType?: 'insert' | 'update' | 'delete';
397
+ currentUrl?: string;
398
+ }
399
+ type BulkProgressCallback = (update: BulkProgressUpdate) => void;
400
+ /** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
401
+ type CrawlPageEvent = {
402
+ url: string;
403
+ event: 'start' | 'done';
404
+ status?: string;
405
+ error?: string;
406
+ };
407
+ type CrawlPageCallback = (event: CrawlPageEvent) => void;
372
408
  interface CrawlResult extends WebIngestResult {
373
409
  urlsCrawled: number;
374
410
  urlsSkipped: number;
375
411
  urlsFailed: number;
412
+ /** URLs selected for this crawl batch (≤ maxPages); for progress UI. */
413
+ urlsScheduled?: number;
376
414
  crawledAt: Date;
377
415
  }
378
416
  /**
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
423
461
  private cacheStats;
424
462
  constructor(config: WebRAGConfig);
425
463
  private getCollection;
464
+ private ledgerIndexesEnsured;
426
465
  private getLedgerCollection;
427
466
  /**
428
467
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
430
469
  listCrawlLedger(options?: {
431
470
  agentId?: string;
432
471
  domain?: string;
472
+ ingestionId?: string;
433
473
  status?: CrawlLedgerStatus;
434
474
  limit?: number;
435
475
  skip?: number;
@@ -670,6 +710,9 @@ declare class WebRAGPlugin implements RAGPlugin {
670
710
  private crawlPageSmart;
671
711
  private crawlPageRendered;
672
712
  private discoverSitemaps;
713
+ private emitBulkProgress;
714
+ private emitCrawlProgress;
715
+ private emitCrawlPage;
673
716
  private createDebugCollector;
674
717
  /**
675
718
  * Clean extracted text content
@@ -740,4 +783,4 @@ declare class WebRAGPlugin implements RAGPlugin {
740
783
  getConfig(): Record<string, any>;
741
784
  }
742
785
 
743
- export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
786
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
package/dist/index.d.ts CHANGED
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
118
118
  interface CrawlLedgerDocument {
119
119
  tenantId: string;
120
120
  agentId: string;
121
+ /** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
122
+ ingestionId?: string;
121
123
  urlNormalized: string;
122
124
  url: string;
123
125
  domain: string;
@@ -369,10 +371,46 @@ interface RSSConfig {
369
371
  /**
370
372
  * Crawl result for sitemap/URL crawling
371
373
  */
374
+ type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
375
+ /** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
376
+ interface CrawlProgressUpdate {
377
+ phase: CrawlProgressPhase;
378
+ /** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
379
+ urlsDiscovered?: number;
380
+ /** URLs that will be crawled in this run (≤ maxPages). */
381
+ urlsScheduled?: number;
382
+ /** During crawl: batches done. During indexing: documents fully embedded so far. */
383
+ pagesProcessed?: number;
384
+ /** During indexing: total text chunks to embed (drives web_content writes). */
385
+ chunksTotal?: number;
386
+ /** During indexing: chunks embedded so far. */
387
+ chunksProcessed?: number;
388
+ }
389
+ type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
390
+ type BulkProgressPhase = 'processing' | 'indexing';
391
+ /** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
392
+ interface BulkProgressUpdate {
393
+ phase: BulkProgressPhase;
394
+ opsTotal: number;
395
+ opsDone: number;
396
+ currentOpType?: 'insert' | 'update' | 'delete';
397
+ currentUrl?: string;
398
+ }
399
+ type BulkProgressCallback = (update: BulkProgressUpdate) => void;
400
+ /** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
401
+ type CrawlPageEvent = {
402
+ url: string;
403
+ event: 'start' | 'done';
404
+ status?: string;
405
+ error?: string;
406
+ };
407
+ type CrawlPageCallback = (event: CrawlPageEvent) => void;
372
408
  interface CrawlResult extends WebIngestResult {
373
409
  urlsCrawled: number;
374
410
  urlsSkipped: number;
375
411
  urlsFailed: number;
412
+ /** URLs selected for this crawl batch (≤ maxPages); for progress UI. */
413
+ urlsScheduled?: number;
376
414
  crawledAt: Date;
377
415
  }
378
416
  /**
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
423
461
  private cacheStats;
424
462
  constructor(config: WebRAGConfig);
425
463
  private getCollection;
464
+ private ledgerIndexesEnsured;
426
465
  private getLedgerCollection;
427
466
  /**
428
467
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
430
469
  listCrawlLedger(options?: {
431
470
  agentId?: string;
432
471
  domain?: string;
472
+ ingestionId?: string;
433
473
  status?: CrawlLedgerStatus;
434
474
  limit?: number;
435
475
  skip?: number;
@@ -670,6 +710,9 @@ declare class WebRAGPlugin implements RAGPlugin {
670
710
  private crawlPageSmart;
671
711
  private crawlPageRendered;
672
712
  private discoverSitemaps;
713
+ private emitBulkProgress;
714
+ private emitCrawlProgress;
715
+ private emitCrawlPage;
673
716
  private createDebugCollector;
674
717
  /**
675
718
  * Clean extracted text content
@@ -740,4 +783,4 @@ declare class WebRAGPlugin implements RAGPlugin {
740
783
  getConfig(): Record<string, any>;
741
784
  }
742
785
 
743
- export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
786
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
package/dist/index.js CHANGED
@@ -40,6 +40,24 @@ var import_openai = __toESM(require("openai"));
40
40
  var cheerio = __toESM(require("cheerio"));
41
41
  var fs = __toESM(require("fs"));
42
42
  var path = __toESM(require("path"));
43
/**
 * Pick the URL to report for a bulk operation.
 *
 * Looks at `op.document.metadata` and returns the first of `url`, then
 * `source`, that is a string with non-whitespace content. The value is
 * returned trimmed.
 *
 * @param op Bulk operation; `document` and `metadata` may be absent.
 * @returns The trimmed URL/source string, or `undefined` when neither field qualifies.
 */
function bulkOpCurrentUrl(op) {
  const metadata = op.document?.metadata;
  for (const key of ["url", "source"]) {
    const candidate = metadata?.[key];
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return undefined;
}
49
/**
 * Tell whether a document being inserted is a "URL listing": its metadata has
 * `type === "url"` and a string `url` that parses as an absolute http(s) URL.
 *
 * @param document Document whose `metadata` is inspected; `metadata` may be absent.
 * @returns `true` only for a parseable `http:` or `https:` URL; `false` otherwise.
 */
function isUrlListingInsert(document) {
  const metadata = document.metadata;
  if (metadata?.type !== "url" || typeof metadata.url !== "string") return false;
  const candidate = metadata.url.trim();
  if (candidate === "") return false;
  let protocol;
  try {
    protocol = new URL(candidate).protocol;
  } catch {
    // Not parseable as an absolute URL.
    return false;
  }
  return ["http:", "https:"].includes(protocol);
}
43
61
  var WebRAGPlugin = class _WebRAGPlugin {
44
62
  name = "web-rag";
45
63
  type = "rag";
@@ -78,6 +96,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
78
96
  }
79
97
  return this.db.collection(this.config.collection);
80
98
  }
99
+ ledgerIndexesEnsured = false;
81
100
  async getLedgerCollection() {
82
101
  if (!this.client) {
83
102
  this.client = new import_mongodb.MongoClient(this.config.mongoUri);
@@ -85,7 +104,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
85
104
  this.db = this.client.db(this.config.dbName);
86
105
  }
87
106
  const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
88
- return this.db.collection(name);
107
+ const col = this.db.collection(name);
108
+ if (!this.ledgerIndexesEnsured) {
109
+ this.ledgerIndexesEnsured = true;
110
+ await col.createIndex(
111
+ { tenantId: 1, agentId: 1, urlNormalized: 1 },
112
+ { unique: true }
113
+ );
114
+ await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
115
+ }
116
+ return col;
89
117
  }
90
118
  /**
91
119
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -95,6 +123,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
95
123
  const filter = { tenantId: this.config.tenantId };
96
124
  filter.agentId = options.agentId ?? "shared";
97
125
  if (options.domain) filter.domain = options.domain;
126
+ if (options.ingestionId) filter.ingestionId = options.ingestionId;
98
127
  if (options.status) filter.lastStatus = options.status;
99
128
  const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
100
129
  const skip = Math.max(options.skip ?? 0, 0);
@@ -163,6 +192,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
163
192
  lastCrawledAt: now,
164
193
  updatedAt: now
165
194
  };
195
+ if (params.ingestionId) {
196
+ $set.ingestionId = params.ingestionId;
197
+ }
166
198
  if (errMsg !== void 0) {
167
199
  $set.errorMessage = errMsg;
168
200
  } else if (params.status === "indexed" && params.doc) {
@@ -175,9 +207,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
175
207
  $set.docId = params.doc.id;
176
208
  } else {
177
209
  $set.modeUsed = params.diag?.modeUsed;
178
- $set.contentLength = null;
179
- $set.title = null;
180
- $set.docId = null;
210
+ $set.contentLength = params.contentLength ?? null;
211
+ $set.title = params.title ?? null;
212
+ $set.docId = params.docId ?? null;
181
213
  }
182
214
  await col.updateOne(
183
215
  {
@@ -421,9 +453,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
421
453
  let indexed = 0;
422
454
  const errors = [];
423
455
  const agentId = options?.agentId || "shared";
424
- for (const doc of documents) {
456
+ const onCrawlProgress = options?.metadata?.onCrawlProgress;
457
+ const indexingTotal = documents.length;
458
+ const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
459
+ const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
460
+ let chunksProcessed = 0;
461
+ if (onCrawlProgress && indexingTotal > 0) {
462
+ this.emitCrawlProgress(
463
+ { metadata: options?.metadata },
464
+ {
465
+ phase: "indexing",
466
+ urlsScheduled: indexingTotal,
467
+ pagesProcessed: 0,
468
+ chunksTotal,
469
+ chunksProcessed: 0
470
+ }
471
+ );
472
+ }
473
+ for (let docIndex = 0; docIndex < documents.length; docIndex++) {
474
+ const doc = documents[docIndex];
475
+ const chunks = chunkPlan[docIndex];
425
476
  try {
426
- const chunks = this.chunkContent(doc.content);
427
477
  const isChunked = chunks.length > 1;
428
478
  if (isChunked) {
429
479
  await collection.deleteMany({
@@ -458,6 +508,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
458
508
  },
459
509
  { upsert: true }
460
510
  );
511
+ chunksProcessed++;
512
+ if (onCrawlProgress) {
513
+ this.emitCrawlProgress(
514
+ { metadata: options?.metadata },
515
+ {
516
+ phase: "indexing",
517
+ urlsScheduled: indexingTotal,
518
+ pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
519
+ chunksTotal,
520
+ chunksProcessed
521
+ }
522
+ );
523
+ }
461
524
  }
462
525
  indexed++;
463
526
  } catch (error) {
@@ -537,23 +600,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
537
600
  let deleted = 0;
538
601
  let failed = 0;
539
602
  const errors = [];
603
+ const opsTotal = operations.length;
604
+ let opsDone = 0;
605
+ const ingestOptions = options ?? {};
606
+ this.emitBulkProgress(ingestOptions, {
607
+ phase: "processing",
608
+ opsTotal,
609
+ opsDone: 0
610
+ });
540
611
  for (const op of operations) {
612
+ const currentUrl = bulkOpCurrentUrl(op);
541
613
  try {
542
614
  switch (op.type) {
543
615
  case "insert":
544
616
  if (op.document) {
545
- await this.ingest([op.document], options);
546
- inserted++;
617
+ if (isUrlListingInsert(op.document)) {
618
+ const url = bulkOpCurrentUrl(op);
619
+ const crawlResult = await this.ingestSinglePageFromUrl(
620
+ {
621
+ url,
622
+ metadata: {
623
+ ...op.document.metadata ?? {},
624
+ url
625
+ }
626
+ },
627
+ ingestOptions
628
+ );
629
+ if (crawlResult.indexed > 0) {
630
+ inserted++;
631
+ } else {
632
+ failed++;
633
+ const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
634
+ errors.push({
635
+ id: op.id,
636
+ operation: op.type,
637
+ error: err
638
+ });
639
+ }
640
+ } else {
641
+ await this.ingest([op.document], ingestOptions);
642
+ inserted++;
643
+ }
547
644
  }
548
645
  break;
549
646
  case "update":
550
647
  if (op.document) {
551
- await this.update(op.id, op.document, options);
648
+ await this.update(op.id, op.document, ingestOptions);
552
649
  updated++;
553
650
  }
554
651
  break;
555
652
  case "delete":
556
- const count = await this.delete(op.id, options);
653
+ const count = await this.delete(op.id, ingestOptions);
557
654
  deleted += count;
558
655
  break;
559
656
  }
@@ -564,6 +661,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
564
661
  operation: op.type,
565
662
  error: error.message || "Unknown error"
566
663
  });
664
+ } finally {
665
+ opsDone++;
666
+ this.emitBulkProgress(ingestOptions, {
667
+ phase: "processing",
668
+ opsTotal,
669
+ opsDone,
670
+ currentOpType: op.type,
671
+ ...currentUrl ? { currentUrl } : {}
672
+ });
567
673
  }
568
674
  }
569
675
  return {
@@ -1130,6 +1236,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1130
1236
  };
1131
1237
  }
1132
1238
  const dbg = this.createDebugCollector(config.debug);
1239
+ this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
1133
1240
  const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
1134
1241
  if (!base) {
1135
1242
  return {
@@ -1161,6 +1268,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
1161
1268
  if (config.excludePatterns?.length) {
1162
1269
  filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
1163
1270
  }
1271
+ this.emitCrawlProgress(config, {
1272
+ phase: "discovering",
1273
+ urlsDiscovered: filteredUrls.length
1274
+ });
1164
1275
  urlsToCrawl = filteredUrls.slice(0, maxPages);
1165
1276
  urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
1166
1277
  break;
@@ -1182,7 +1293,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
1182
1293
  urlsToCrawl = discovery.urls;
1183
1294
  urlsSkipped = discovery.skipped;
1184
1295
  dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
1296
+ this.emitCrawlProgress(config, {
1297
+ phase: "discovering",
1298
+ urlsDiscovered: urlsToCrawl.length
1299
+ });
1185
1300
  }
1301
+ this.emitCrawlProgress(config, {
1302
+ phase: "crawling",
1303
+ urlsDiscovered: urlsToCrawl.length,
1304
+ urlsScheduled: urlsToCrawl.length
1305
+ });
1186
1306
  const result = await this.crawlUrls(urlsToCrawl, {
1187
1307
  contentSelector: config.contentSelector,
1188
1308
  titleSelector: config.titleSelector,
@@ -1204,9 +1324,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1204
1324
  return {
1205
1325
  ...result,
1206
1326
  urlsSkipped,
1327
+ /** URLs selected for this crawl (≤ maxPages); use as the denominator for progress UI. */
1328
+ urlsScheduled: urlsToCrawl.length,
1207
1329
  crawledAt: /* @__PURE__ */ new Date(),
1208
1330
  metadata: {
1209
1331
  ...result.metadata || {},
1332
+ urlsScheduled: urlsToCrawl.length,
1210
1333
  discoveryDebug: dbg.summary()
1211
1334
  }
1212
1335
  };
@@ -1434,6 +1557,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1434
1557
  const forceRecrawl = !!(options && options.forceRecrawl);
1435
1558
  const agentId = options?.agentId ?? "shared";
1436
1559
  const stripQ = config.stripQueryParams ?? false;
1560
+ const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
1437
1561
  const urlByNorm = /* @__PURE__ */ new Map();
1438
1562
  for (const u of urls) {
1439
1563
  const norm = this.normalizeLedgerUrl(u, stripQ) || u;
@@ -1462,6 +1586,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1462
1586
  const results = await Promise.allSettled(
1463
1587
  batch.map(async (url) => {
1464
1588
  const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
1589
+ this.emitCrawlPage(config, { url, event: "start" });
1465
1590
  if (ledgerOpts && !forceRecrawl) {
1466
1591
  const entry = await this.findLedgerEntry(urlNormalized, agentId);
1467
1592
  if (this.shouldSkipLedger(
@@ -1482,6 +1607,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
1482
1607
  docId: entry?.docId
1483
1608
  });
1484
1609
  dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
1610
+ if (ledgerOpts) {
1611
+ await this.upsertLedgerRecord({
1612
+ url,
1613
+ urlNormalized,
1614
+ agentId,
1615
+ ingestionId,
1616
+ status: "skipped_ledger",
1617
+ title: entry?.title,
1618
+ docId: entry?.docId,
1619
+ contentLength: entry?.contentLength
1620
+ });
1621
+ }
1622
+ this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
1485
1623
  return { kind: "ledger_skip", url };
1486
1624
  }
1487
1625
  }
@@ -1505,6 +1643,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1505
1643
  url,
1506
1644
  urlNormalized,
1507
1645
  agentId,
1646
+ ingestionId,
1508
1647
  status: crawlSt,
1509
1648
  doc,
1510
1649
  diag
@@ -1521,6 +1660,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1521
1660
  docId: doc?.id,
1522
1661
  error: diag?.errorMessage
1523
1662
  });
1663
+ this.emitCrawlPage(config, {
1664
+ url,
1665
+ event: "done",
1666
+ status: crawlSt,
1667
+ error: diag?.errorMessage
1668
+ });
1524
1669
  return { kind: "doc", doc, url };
1525
1670
  } catch (error) {
1526
1671
  const msg = error instanceof Error ? error.message : String(error);
@@ -1529,6 +1674,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1529
1674
  url,
1530
1675
  urlNormalized,
1531
1676
  agentId,
1677
+ ingestionId,
1532
1678
  status: "error",
1533
1679
  errorMessage: msg
1534
1680
  });
@@ -1539,6 +1685,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1539
1685
  status: "error",
1540
1686
  error: msg
1541
1687
  });
1688
+ this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
1542
1689
  throw { url, error };
1543
1690
  }
1544
1691
  })
@@ -1561,12 +1708,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
1561
1708
  });
1562
1709
  }
1563
1710
  }
1711
+ this.emitCrawlProgress(config, {
1712
+ phase: "crawling",
1713
+ urlsScheduled: uniqueUrls.length,
1714
+ pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
1715
+ });
1564
1716
  if (i + concurrency < uniqueUrls.length) {
1565
1717
  await this.delay(delayMs);
1566
1718
  }
1567
1719
  }
1568
1720
  if (documents.length > 0) {
1569
- const ingestResult = await this.ingest(documents, options);
1721
+ const ingestResult = await this.ingest(documents, {
1722
+ ...options,
1723
+ metadata: {
1724
+ ...options?.metadata ?? {},
1725
+ onCrawlProgress: config.metadata?.onCrawlProgress
1726
+ }
1727
+ });
1570
1728
  indexed = ingestResult.indexed;
1571
1729
  if (ingestResult.errors) {
1572
1730
  errors.push(...ingestResult.errors);
@@ -1957,6 +2115,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
1957
2115
  }
1958
2116
  return Array.from(found);
1959
2117
  }
2118
+ emitBulkProgress(options, update) {
2119
+ const fn = options?.metadata?.onBulkProgress;
2120
+ if (!fn) return;
2121
+ try {
2122
+ fn(update);
2123
+ } catch {
2124
+ }
2125
+ }
2126
+ emitCrawlProgress(config, update) {
2127
+ const fn = config.metadata?.onCrawlProgress;
2128
+ if (!fn) return;
2129
+ try {
2130
+ fn(update);
2131
+ } catch {
2132
+ }
2133
+ }
2134
+ emitCrawlPage(config, event) {
2135
+ const fn = config.metadata?.onCrawlPage;
2136
+ if (!fn) return;
2137
+ try {
2138
+ fn(event);
2139
+ } catch {
2140
+ }
2141
+ }
1960
2142
  createDebugCollector(debug) {
1961
2143
  const enabled = !!debug?.enabled;
1962
2144
  const level = debug?.level || "summary";
@@ -2245,6 +2427,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2245
2427
  filterableFields: this.config.filterableFields,
2246
2428
  typeBoosts: this.config.typeBoosts,
2247
2429
  recencyBoost: this.config.recencyBoost,
2430
+ crawlLedger: this.config.crawlLedger,
2248
2431
  priority: this.priority
2249
2432
  };
2250
2433
  }
package/dist/index.mjs CHANGED
@@ -4,6 +4,24 @@ import OpenAI from "openai";
4
4
  import * as cheerio from "cheerio";
5
5
  import * as fs from "fs";
6
6
  import * as path from "path";
7
/**
 * Pick the URL to report for a bulk operation.
 *
 * Looks at `op.document.metadata` and returns the first of `url`, then
 * `source`, that is a string with non-whitespace content. The value is
 * returned trimmed.
 *
 * @param op Bulk operation; `document` and `metadata` may be absent.
 * @returns The trimmed URL/source string, or `undefined` when neither field qualifies.
 */
function bulkOpCurrentUrl(op) {
  const metadata = op.document?.metadata;
  for (const key of ["url", "source"]) {
    const candidate = metadata?.[key];
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return undefined;
}
13
/**
 * Tell whether a document being inserted is a "URL listing": its metadata has
 * `type === "url"` and a string `url` that parses as an absolute http(s) URL.
 *
 * @param document Document whose `metadata` is inspected; `metadata` may be absent.
 * @returns `true` only for a parseable `http:` or `https:` URL; `false` otherwise.
 */
function isUrlListingInsert(document) {
  const metadata = document.metadata;
  if (metadata?.type !== "url" || typeof metadata.url !== "string") return false;
  const candidate = metadata.url.trim();
  if (candidate === "") return false;
  let protocol;
  try {
    protocol = new URL(candidate).protocol;
  } catch {
    // Not parseable as an absolute URL.
    return false;
  }
  return ["http:", "https:"].includes(protocol);
}
7
25
  var WebRAGPlugin = class _WebRAGPlugin {
8
26
  name = "web-rag";
9
27
  type = "rag";
@@ -42,6 +60,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
42
60
  }
43
61
  return this.db.collection(this.config.collection);
44
62
  }
63
+ ledgerIndexesEnsured = false;
45
64
  async getLedgerCollection() {
46
65
  if (!this.client) {
47
66
  this.client = new MongoClient(this.config.mongoUri);
@@ -49,7 +68,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
49
68
  this.db = this.client.db(this.config.dbName);
50
69
  }
51
70
  const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
52
- return this.db.collection(name);
71
+ const col = this.db.collection(name);
72
+ if (!this.ledgerIndexesEnsured) {
73
+ this.ledgerIndexesEnsured = true;
74
+ await col.createIndex(
75
+ { tenantId: 1, agentId: 1, urlNormalized: 1 },
76
+ { unique: true }
77
+ );
78
+ await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
79
+ }
80
+ return col;
53
81
  }
54
82
  /**
55
83
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -59,6 +87,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
59
87
  const filter = { tenantId: this.config.tenantId };
60
88
  filter.agentId = options.agentId ?? "shared";
61
89
  if (options.domain) filter.domain = options.domain;
90
+ if (options.ingestionId) filter.ingestionId = options.ingestionId;
62
91
  if (options.status) filter.lastStatus = options.status;
63
92
  const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
64
93
  const skip = Math.max(options.skip ?? 0, 0);
@@ -127,6 +156,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
127
156
  lastCrawledAt: now,
128
157
  updatedAt: now
129
158
  };
159
+ if (params.ingestionId) {
160
+ $set.ingestionId = params.ingestionId;
161
+ }
130
162
  if (errMsg !== void 0) {
131
163
  $set.errorMessage = errMsg;
132
164
  } else if (params.status === "indexed" && params.doc) {
@@ -139,9 +171,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
139
171
  $set.docId = params.doc.id;
140
172
  } else {
141
173
  $set.modeUsed = params.diag?.modeUsed;
142
- $set.contentLength = null;
143
- $set.title = null;
144
- $set.docId = null;
174
+ $set.contentLength = params.contentLength ?? null;
175
+ $set.title = params.title ?? null;
176
+ $set.docId = params.docId ?? null;
145
177
  }
146
178
  await col.updateOne(
147
179
  {
@@ -385,9 +417,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
385
417
  let indexed = 0;
386
418
  const errors = [];
387
419
  const agentId = options?.agentId || "shared";
388
- for (const doc of documents) {
420
+ const onCrawlProgress = options?.metadata?.onCrawlProgress;
421
+ const indexingTotal = documents.length;
422
+ const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
423
+ const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
424
+ let chunksProcessed = 0;
425
+ if (onCrawlProgress && indexingTotal > 0) {
426
+ this.emitCrawlProgress(
427
+ { metadata: options?.metadata },
428
+ {
429
+ phase: "indexing",
430
+ urlsScheduled: indexingTotal,
431
+ pagesProcessed: 0,
432
+ chunksTotal,
433
+ chunksProcessed: 0
434
+ }
435
+ );
436
+ }
437
+ for (let docIndex = 0; docIndex < documents.length; docIndex++) {
438
+ const doc = documents[docIndex];
439
+ const chunks = chunkPlan[docIndex];
389
440
  try {
390
- const chunks = this.chunkContent(doc.content);
391
441
  const isChunked = chunks.length > 1;
392
442
  if (isChunked) {
393
443
  await collection.deleteMany({
@@ -422,6 +472,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
422
472
  },
423
473
  { upsert: true }
424
474
  );
475
+ chunksProcessed++;
476
+ if (onCrawlProgress) {
477
+ this.emitCrawlProgress(
478
+ { metadata: options?.metadata },
479
+ {
480
+ phase: "indexing",
481
+ urlsScheduled: indexingTotal,
482
+ pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
483
+ chunksTotal,
484
+ chunksProcessed
485
+ }
486
+ );
487
+ }
425
488
  }
426
489
  indexed++;
427
490
  } catch (error) {
@@ -501,23 +564,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
501
564
  let deleted = 0;
502
565
  let failed = 0;
503
566
  const errors = [];
567
+ const opsTotal = operations.length;
568
+ let opsDone = 0;
569
+ const ingestOptions = options ?? {};
570
+ this.emitBulkProgress(ingestOptions, {
571
+ phase: "processing",
572
+ opsTotal,
573
+ opsDone: 0
574
+ });
504
575
  for (const op of operations) {
576
+ const currentUrl = bulkOpCurrentUrl(op);
505
577
  try {
506
578
  switch (op.type) {
507
579
  case "insert":
508
580
  if (op.document) {
509
- await this.ingest([op.document], options);
510
- inserted++;
581
+ if (isUrlListingInsert(op.document)) {
582
+ const url = bulkOpCurrentUrl(op);
583
+ const crawlResult = await this.ingestSinglePageFromUrl(
584
+ {
585
+ url,
586
+ metadata: {
587
+ ...op.document.metadata ?? {},
588
+ url
589
+ }
590
+ },
591
+ ingestOptions
592
+ );
593
+ if (crawlResult.indexed > 0) {
594
+ inserted++;
595
+ } else {
596
+ failed++;
597
+ const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
598
+ errors.push({
599
+ id: op.id,
600
+ operation: op.type,
601
+ error: err
602
+ });
603
+ }
604
+ } else {
605
+ await this.ingest([op.document], ingestOptions);
606
+ inserted++;
607
+ }
511
608
  }
512
609
  break;
513
610
  case "update":
514
611
  if (op.document) {
515
- await this.update(op.id, op.document, options);
612
+ await this.update(op.id, op.document, ingestOptions);
516
613
  updated++;
517
614
  }
518
615
  break;
519
616
  case "delete":
520
- const count = await this.delete(op.id, options);
617
+ const count = await this.delete(op.id, ingestOptions);
521
618
  deleted += count;
522
619
  break;
523
620
  }
@@ -528,6 +625,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
528
625
  operation: op.type,
529
626
  error: error.message || "Unknown error"
530
627
  });
628
+ } finally {
629
+ opsDone++;
630
+ this.emitBulkProgress(ingestOptions, {
631
+ phase: "processing",
632
+ opsTotal,
633
+ opsDone,
634
+ currentOpType: op.type,
635
+ ...currentUrl ? { currentUrl } : {}
636
+ });
531
637
  }
532
638
  }
533
639
  return {
@@ -1094,6 +1200,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1094
1200
  };
1095
1201
  }
1096
1202
  const dbg = this.createDebugCollector(config.debug);
1203
+ this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
1097
1204
  const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
1098
1205
  if (!base) {
1099
1206
  return {
@@ -1125,6 +1232,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
1125
1232
  if (config.excludePatterns?.length) {
1126
1233
  filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
1127
1234
  }
1235
+ this.emitCrawlProgress(config, {
1236
+ phase: "discovering",
1237
+ urlsDiscovered: filteredUrls.length
1238
+ });
1128
1239
  urlsToCrawl = filteredUrls.slice(0, maxPages);
1129
1240
  urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
1130
1241
  break;
@@ -1146,7 +1257,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
1146
1257
  urlsToCrawl = discovery.urls;
1147
1258
  urlsSkipped = discovery.skipped;
1148
1259
  dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
1260
+ this.emitCrawlProgress(config, {
1261
+ phase: "discovering",
1262
+ urlsDiscovered: urlsToCrawl.length
1263
+ });
1149
1264
  }
1265
+ this.emitCrawlProgress(config, {
1266
+ phase: "crawling",
1267
+ urlsDiscovered: urlsToCrawl.length,
1268
+ urlsScheduled: urlsToCrawl.length
1269
+ });
1150
1270
  const result = await this.crawlUrls(urlsToCrawl, {
1151
1271
  contentSelector: config.contentSelector,
1152
1272
  titleSelector: config.titleSelector,
@@ -1168,9 +1288,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1168
1288
  return {
1169
1289
  ...result,
1170
1290
  urlsSkipped,
1291
+ /** URLs selected for this crawl (≤ maxPages); use for progress UI denominator. */
1292
+ urlsScheduled: urlsToCrawl.length,
1171
1293
  crawledAt: /* @__PURE__ */ new Date(),
1172
1294
  metadata: {
1173
1295
  ...result.metadata || {},
1296
+ urlsScheduled: urlsToCrawl.length,
1174
1297
  discoveryDebug: dbg.summary()
1175
1298
  }
1176
1299
  };
@@ -1398,6 +1521,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1398
1521
  const forceRecrawl = !!(options && options.forceRecrawl);
1399
1522
  const agentId = options?.agentId ?? "shared";
1400
1523
  const stripQ = config.stripQueryParams ?? false;
1524
+ const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
1401
1525
  const urlByNorm = /* @__PURE__ */ new Map();
1402
1526
  for (const u of urls) {
1403
1527
  const norm = this.normalizeLedgerUrl(u, stripQ) || u;
@@ -1426,6 +1550,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1426
1550
  const results = await Promise.allSettled(
1427
1551
  batch.map(async (url) => {
1428
1552
  const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
1553
+ this.emitCrawlPage(config, { url, event: "start" });
1429
1554
  if (ledgerOpts && !forceRecrawl) {
1430
1555
  const entry = await this.findLedgerEntry(urlNormalized, agentId);
1431
1556
  if (this.shouldSkipLedger(
@@ -1446,6 +1571,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
1446
1571
  docId: entry?.docId
1447
1572
  });
1448
1573
  dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
1574
+ if (ledgerOpts) {
1575
+ await this.upsertLedgerRecord({
1576
+ url,
1577
+ urlNormalized,
1578
+ agentId,
1579
+ ingestionId,
1580
+ status: "skipped_ledger",
1581
+ title: entry?.title,
1582
+ docId: entry?.docId,
1583
+ contentLength: entry?.contentLength
1584
+ });
1585
+ }
1586
+ this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
1449
1587
  return { kind: "ledger_skip", url };
1450
1588
  }
1451
1589
  }
@@ -1469,6 +1607,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1469
1607
  url,
1470
1608
  urlNormalized,
1471
1609
  agentId,
1610
+ ingestionId,
1472
1611
  status: crawlSt,
1473
1612
  doc,
1474
1613
  diag
@@ -1485,6 +1624,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1485
1624
  docId: doc?.id,
1486
1625
  error: diag?.errorMessage
1487
1626
  });
1627
+ this.emitCrawlPage(config, {
1628
+ url,
1629
+ event: "done",
1630
+ status: crawlSt,
1631
+ error: diag?.errorMessage
1632
+ });
1488
1633
  return { kind: "doc", doc, url };
1489
1634
  } catch (error) {
1490
1635
  const msg = error instanceof Error ? error.message : String(error);
@@ -1493,6 +1638,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1493
1638
  url,
1494
1639
  urlNormalized,
1495
1640
  agentId,
1641
+ ingestionId,
1496
1642
  status: "error",
1497
1643
  errorMessage: msg
1498
1644
  });
@@ -1503,6 +1649,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1503
1649
  status: "error",
1504
1650
  error: msg
1505
1651
  });
1652
+ this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
1506
1653
  throw { url, error };
1507
1654
  }
1508
1655
  })
@@ -1525,12 +1672,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
1525
1672
  });
1526
1673
  }
1527
1674
  }
1675
+ this.emitCrawlProgress(config, {
1676
+ phase: "crawling",
1677
+ urlsScheduled: uniqueUrls.length,
1678
+ pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
1679
+ });
1528
1680
  if (i + concurrency < uniqueUrls.length) {
1529
1681
  await this.delay(delayMs);
1530
1682
  }
1531
1683
  }
1532
1684
  if (documents.length > 0) {
1533
- const ingestResult = await this.ingest(documents, options);
1685
+ const ingestResult = await this.ingest(documents, {
1686
+ ...options,
1687
+ metadata: {
1688
+ ...options?.metadata ?? {},
1689
+ onCrawlProgress: config.metadata?.onCrawlProgress
1690
+ }
1691
+ });
1534
1692
  indexed = ingestResult.indexed;
1535
1693
  if (ingestResult.errors) {
1536
1694
  errors.push(...ingestResult.errors);
@@ -1921,6 +2079,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
1921
2079
  }
1922
2080
  return Array.from(found);
1923
2081
  }
2082
+ emitBulkProgress(options, update) {
2083
+ const fn = options?.metadata?.onBulkProgress;
2084
+ if (!fn) return;
2085
+ try {
2086
+ fn(update);
2087
+ } catch {
2088
+ }
2089
+ }
2090
+ emitCrawlProgress(config, update) {
2091
+ const fn = config.metadata?.onCrawlProgress;
2092
+ if (!fn) return;
2093
+ try {
2094
+ fn(update);
2095
+ } catch {
2096
+ }
2097
+ }
2098
+ emitCrawlPage(config, event) {
2099
+ const fn = config.metadata?.onCrawlPage;
2100
+ if (!fn) return;
2101
+ try {
2102
+ fn(event);
2103
+ } catch {
2104
+ }
2105
+ }
1924
2106
  createDebugCollector(debug) {
1925
2107
  const enabled = !!debug?.enabled;
1926
2108
  const level = debug?.level || "summary";
@@ -2209,6 +2391,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
2209
2391
  filterableFields: this.config.filterableFields,
2210
2392
  typeBoosts: this.config.typeBoosts,
2211
2393
  recencyBoost: this.config.recencyBoost,
2394
+ crawlLedger: this.config.crawlLedger,
2212
2395
  priority: this.priority
2213
2396
  };
2214
2397
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@snap-agent/rag-web",
3
- "version": "0.1.4",
3
+ "version": "0.1.5",
4
4
  "description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",