@snap-agent/rag-web 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -26,6 +26,8 @@ interface WebDocument {
26
26
  interface StoredWebDocument extends WebDocument {
27
27
  tenantId: string;
28
28
  agentId?: string;
29
+ documentId?: string;
30
+ chunkIndex?: number;
29
31
  embedding: number[];
30
32
  createdAt: Date;
31
33
  updatedAt?: Date;
@@ -52,6 +54,8 @@ interface WebRAGConfig {
52
54
  decayDays: number;
53
55
  maxBoost?: number;
54
56
  };
57
+ maxChunkSize?: number;
58
+ chunkOverlap?: number;
55
59
  cache?: {
56
60
  embeddings?: {
57
61
  enabled: boolean;
@@ -456,16 +460,23 @@ declare class WebRAGPlugin implements RAGPlugin {
456
460
  private vectorSearch;
457
461
  private generateEmbedding;
458
462
  private generateEmbeddingsBatch;
463
+ /**
464
+ * Split content into chunks by paragraph boundaries, respecting maxChunkSize.
465
+ * Returns the original content as a single chunk when chunking is disabled
466
+ * (maxChunkSize === 0) or the content fits within maxChunkSize.
467
+ */
468
+ private chunkContent;
459
469
  /**
460
470
  * Ingest documents into the CMS RAG system
461
471
  */
462
472
  ingest(documents: RAGDocument[], options?: IngestOptions): Promise<IngestResult>;
463
473
  /**
464
- * Update a single document
474
+ * Update a single document.
475
+ * When content changes the document is re-chunked (old chunks removed, new ones inserted).
465
476
  */
466
477
  update(id: string, document: Partial<RAGDocument>, options?: IngestOptions): Promise<void>;
467
478
  /**
468
- * Delete document(s) by ID
479
+ * Delete document(s) by ID — also removes any chunks belonging to the document.
469
480
  */
470
481
  delete(ids: string | string[], options?: IngestOptions): Promise<number>;
471
482
  /**
package/dist/index.d.ts CHANGED
@@ -26,6 +26,8 @@ interface WebDocument {
26
26
  interface StoredWebDocument extends WebDocument {
27
27
  tenantId: string;
28
28
  agentId?: string;
29
+ documentId?: string;
30
+ chunkIndex?: number;
29
31
  embedding: number[];
30
32
  createdAt: Date;
31
33
  updatedAt?: Date;
@@ -52,6 +54,8 @@ interface WebRAGConfig {
52
54
  decayDays: number;
53
55
  maxBoost?: number;
54
56
  };
57
+ maxChunkSize?: number;
58
+ chunkOverlap?: number;
55
59
  cache?: {
56
60
  embeddings?: {
57
61
  enabled: boolean;
@@ -456,16 +460,23 @@ declare class WebRAGPlugin implements RAGPlugin {
456
460
  private vectorSearch;
457
461
  private generateEmbedding;
458
462
  private generateEmbeddingsBatch;
463
+ /**
464
+ * Split content into chunks by paragraph boundaries, respecting maxChunkSize.
465
+ * Returns the original content as a single chunk when chunking is disabled
466
+ * (maxChunkSize === 0) or the content fits within maxChunkSize.
467
+ */
468
+ private chunkContent;
459
469
  /**
460
470
  * Ingest documents into the CMS RAG system
461
471
  */
462
472
  ingest(documents: RAGDocument[], options?: IngestOptions): Promise<IngestResult>;
463
473
  /**
464
- * Update a single document
474
+ * Update a single document.
475
+ * When content changes the document is re-chunked (old chunks removed, new ones inserted).
465
476
  */
466
477
  update(id: string, document: Partial<RAGDocument>, options?: IngestOptions): Promise<void>;
467
478
  /**
468
- * Delete document(s) by ID
479
+ * Delete document(s) by ID — also removes any chunks belonging to the document.
469
480
  */
470
481
  delete(ids: string | string[], options?: IngestOptions): Promise<number>;
471
482
  /**
package/dist/index.js CHANGED
@@ -60,6 +60,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
60
60
  limit: 10,
61
61
  minScore: 0.7,
62
62
  filterableFields: ["type"],
63
+ maxChunkSize: 1500,
64
+ chunkOverlap: 200,
63
65
  ...config
64
66
  };
65
67
  this.priority = config.priority ?? 100;
@@ -361,6 +363,52 @@ var WebRAGPlugin = class _WebRAGPlugin {
361
363
  return embeddings;
362
364
  }
363
365
  // ============================================================================
366
+ // Chunking
367
+ // ============================================================================
368
+ /**
369
+ * Split content into chunks by paragraph boundaries, respecting maxChunkSize.
370
+ * Returns the original content as a single chunk when chunking is disabled
371
+ * (maxChunkSize === 0) or the content fits within maxChunkSize.
372
+ */
373
+ chunkContent(content) {
374
+ const maxSize = this.config.maxChunkSize ?? 1500;
375
+ if (maxSize === 0 || content.length <= maxSize) {
376
+ return [content];
377
+ }
378
+ const overlap = this.config.chunkOverlap ?? 200;
379
+ const paragraphs = content.split(/\n\n+/);
380
+ const chunks = [];
381
+ let current = "";
382
+ for (const para of paragraphs) {
383
+ const trimmed = para.trim();
384
+ if (!trimmed) continue;
385
+ if (trimmed.length > maxSize) {
386
+ if (current.trim()) {
387
+ chunks.push(current.trim());
388
+ current = "";
389
+ }
390
+ for (let i = 0; i < trimmed.length; i += maxSize - overlap) {
391
+ const slice = trimmed.slice(i, i + maxSize);
392
+ if (slice.trim()) chunks.push(slice.trim());
393
+ }
394
+ continue;
395
+ }
396
+ const candidate = current ? current + "\n\n" + trimmed : trimmed;
397
+ if (candidate.length > maxSize) {
398
+ if (current.trim()) {
399
+ chunks.push(current.trim());
400
+ }
401
+ current = trimmed;
402
+ } else {
403
+ current = candidate;
404
+ }
405
+ }
406
+ if (current.trim()) {
407
+ chunks.push(current.trim());
408
+ }
409
+ return chunks.length > 0 ? chunks : [content];
410
+ }
411
+ // ============================================================================
364
412
  // Document Ingestion
365
413
  // ============================================================================
366
414
  /**
@@ -370,47 +418,51 @@ var WebRAGPlugin = class _WebRAGPlugin {
370
418
  const collection = await this.getCollection();
371
419
  let indexed = 0;
372
420
  const errors = [];
373
- const batchSize = options?.batchSize ?? 10;
374
- for (let i = 0; i < documents.length; i += batchSize) {
375
- const batch = documents.slice(i, i + batchSize);
376
- const embeddings = await this.generateEmbeddingsBatch(
377
- batch.map((doc) => doc.content)
378
- );
379
- const docsToStore = batch.map((doc, idx) => ({
380
- id: doc.id,
381
- content: doc.content,
382
- metadata: {
383
- type: doc.metadata?.type || "content",
384
- ...doc.metadata
385
- },
386
- tenantId: this.config.tenantId,
387
- // Use 'shared' marker for tenant-wide content, specific agentId for agent-only
388
- agentId: options?.agentId || "shared",
389
- embedding: embeddings[idx]
390
- }));
391
- for (const doc of docsToStore) {
392
- try {
393
- const filter = {
421
+ const agentId = options?.agentId || "shared";
422
+ for (const doc of documents) {
423
+ try {
424
+ const chunks = this.chunkContent(doc.content);
425
+ const isChunked = chunks.length > 1;
426
+ if (isChunked) {
427
+ await collection.deleteMany({
394
428
  tenantId: this.config.tenantId,
395
- id: doc.id,
396
- // Match by agentId ('shared' for tenant-wide, specific for agent-only)
397
- agentId: options?.agentId || "shared"
429
+ documentId: doc.id,
430
+ agentId
431
+ });
432
+ }
433
+ for (let i = 0; i < chunks.length; i++) {
434
+ const chunkId = isChunked ? `chunk-${doc.id}-${i}` : doc.id;
435
+ const embedding = await this.generateEmbedding(chunks[i]);
436
+ const storedDoc = {
437
+ id: chunkId,
438
+ content: chunks[i],
439
+ metadata: {
440
+ type: doc.metadata?.type || "content",
441
+ ...doc.metadata
442
+ },
443
+ tenantId: this.config.tenantId,
444
+ agentId,
445
+ embedding
398
446
  };
447
+ if (isChunked) {
448
+ storedDoc.documentId = doc.id;
449
+ storedDoc.chunkIndex = i;
450
+ }
399
451
  await collection.updateOne(
400
- filter,
452
+ { tenantId: this.config.tenantId, id: chunkId, agentId },
401
453
  {
402
- $set: { ...doc, updatedAt: /* @__PURE__ */ new Date() },
454
+ $set: { ...storedDoc, updatedAt: /* @__PURE__ */ new Date() },
403
455
  $setOnInsert: { createdAt: /* @__PURE__ */ new Date() }
404
456
  },
405
457
  { upsert: true }
406
458
  );
407
- indexed++;
408
- } catch (error) {
409
- errors.push({
410
- id: doc.id,
411
- error: error instanceof Error ? error.message : "Unknown error"
412
- });
413
459
  }
460
+ indexed++;
461
+ } catch (error) {
462
+ errors.push({
463
+ id: doc.id,
464
+ error: error instanceof Error ? error.message : "Unknown error"
465
+ });
414
466
  }
415
467
  }
416
468
  return {
@@ -425,40 +477,51 @@ var WebRAGPlugin = class _WebRAGPlugin {
425
477
  };
426
478
  }
427
479
  /**
428
- * Update a single document
480
+ * Update a single document.
481
+ * When content changes the document is re-chunked (old chunks removed, new ones inserted).
429
482
  */
430
483
  async update(id, document, options) {
431
- const collection = await this.getCollection();
432
- const update = { updatedAt: /* @__PURE__ */ new Date() };
484
+ const agentId = options?.agentId || "shared";
433
485
  if (document.content) {
434
- const embedding = await this.generateEmbedding(document.content);
435
- update.content = document.content;
436
- update.embedding = embedding;
486
+ const fullDoc = {
487
+ id,
488
+ content: document.content,
489
+ metadata: document.metadata ?? { type: "content" }
490
+ };
491
+ await this.delete(id, options);
492
+ await this.ingest([fullDoc], options);
493
+ return;
437
494
  }
495
+ const collection = await this.getCollection();
496
+ const metaUpdate = { updatedAt: /* @__PURE__ */ new Date() };
438
497
  if (document.metadata) {
439
498
  for (const [key, value] of Object.entries(document.metadata)) {
440
- update[`metadata.${key}`] = value;
499
+ metaUpdate[`metadata.${key}`] = value;
441
500
  }
442
501
  }
443
- const filter = {
444
- tenantId: this.config.tenantId,
445
- id,
446
- // Match by agentId ('shared' for tenant-wide, specific for agent-only)
447
- agentId: options?.agentId || "shared"
448
- };
449
- await collection.updateOne(filter, { $set: update });
502
+ await collection.updateMany(
503
+ {
504
+ tenantId: this.config.tenantId,
505
+ agentId,
506
+ $or: [{ id }, { documentId: id }]
507
+ },
508
+ { $set: metaUpdate }
509
+ );
450
510
  }
451
511
  /**
452
- * Delete document(s) by ID
512
+ * Delete document(s) by ID — also removes any chunks belonging to the document.
453
513
  */
454
514
  async delete(ids, options) {
455
515
  const collection = await this.getCollection();
456
516
  const idArray = Array.isArray(ids) ? ids : [ids];
457
517
  const filter = {
458
518
  tenantId: this.config.tenantId,
459
- id: { $in: idArray },
460
- // Match by agentId ('shared' for tenant-wide, specific for agent-only)
461
- agentId: options?.agentId || "shared"
519
+ agentId: options?.agentId || "shared",
520
+ // Match the document itself (id) OR any chunks that belong to it (documentId)
521
+ $or: [
522
+ { id: { $in: idArray } },
523
+ { documentId: { $in: idArray } }
524
+ ]
462
525
  };
463
526
  const result = await collection.deleteMany(filter);
464
527
  return result.deletedCount;
package/dist/index.mjs CHANGED
@@ -24,6 +24,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
24
24
  limit: 10,
25
25
  minScore: 0.7,
26
26
  filterableFields: ["type"],
27
+ maxChunkSize: 1500,
28
+ chunkOverlap: 200,
27
29
  ...config
28
30
  };
29
31
  this.priority = config.priority ?? 100;
@@ -325,6 +327,52 @@ var WebRAGPlugin = class _WebRAGPlugin {
325
327
  return embeddings;
326
328
  }
327
329
  // ============================================================================
330
+ // Chunking
331
+ // ============================================================================
332
+ /**
333
+ * Split content into chunks by paragraph boundaries, respecting maxChunkSize.
334
+ * Returns the original content as a single chunk when chunking is disabled
335
+ * (maxChunkSize === 0) or the content fits within maxChunkSize.
336
+ */
337
+ chunkContent(content) {
338
+ const maxSize = this.config.maxChunkSize ?? 1500;
339
+ if (maxSize === 0 || content.length <= maxSize) {
340
+ return [content];
341
+ }
342
+ const overlap = this.config.chunkOverlap ?? 200;
343
+ const paragraphs = content.split(/\n\n+/);
344
+ const chunks = [];
345
+ let current = "";
346
+ for (const para of paragraphs) {
347
+ const trimmed = para.trim();
348
+ if (!trimmed) continue;
349
+ if (trimmed.length > maxSize) {
350
+ if (current.trim()) {
351
+ chunks.push(current.trim());
352
+ current = "";
353
+ }
354
+ for (let i = 0; i < trimmed.length; i += maxSize - overlap) {
355
+ const slice = trimmed.slice(i, i + maxSize);
356
+ if (slice.trim()) chunks.push(slice.trim());
357
+ }
358
+ continue;
359
+ }
360
+ const candidate = current ? current + "\n\n" + trimmed : trimmed;
361
+ if (candidate.length > maxSize) {
362
+ if (current.trim()) {
363
+ chunks.push(current.trim());
364
+ }
365
+ current = trimmed;
366
+ } else {
367
+ current = candidate;
368
+ }
369
+ }
370
+ if (current.trim()) {
371
+ chunks.push(current.trim());
372
+ }
373
+ return chunks.length > 0 ? chunks : [content];
374
+ }
375
+ // ============================================================================
328
376
  // Document Ingestion
329
377
  // ============================================================================
330
378
  /**
@@ -334,47 +382,51 @@ var WebRAGPlugin = class _WebRAGPlugin {
334
382
  const collection = await this.getCollection();
335
383
  let indexed = 0;
336
384
  const errors = [];
337
- const batchSize = options?.batchSize ?? 10;
338
- for (let i = 0; i < documents.length; i += batchSize) {
339
- const batch = documents.slice(i, i + batchSize);
340
- const embeddings = await this.generateEmbeddingsBatch(
341
- batch.map((doc) => doc.content)
342
- );
343
- const docsToStore = batch.map((doc, idx) => ({
344
- id: doc.id,
345
- content: doc.content,
346
- metadata: {
347
- type: doc.metadata?.type || "content",
348
- ...doc.metadata
349
- },
350
- tenantId: this.config.tenantId,
351
- // Use 'shared' marker for tenant-wide content, specific agentId for agent-only
352
- agentId: options?.agentId || "shared",
353
- embedding: embeddings[idx]
354
- }));
355
- for (const doc of docsToStore) {
356
- try {
357
- const filter = {
385
+ const agentId = options?.agentId || "shared";
386
+ for (const doc of documents) {
387
+ try {
388
+ const chunks = this.chunkContent(doc.content);
389
+ const isChunked = chunks.length > 1;
390
+ if (isChunked) {
391
+ await collection.deleteMany({
358
392
  tenantId: this.config.tenantId,
359
- id: doc.id,
360
- // Match by agentId ('shared' for tenant-wide, specific for agent-only)
361
- agentId: options?.agentId || "shared"
393
+ documentId: doc.id,
394
+ agentId
395
+ });
396
+ }
397
+ for (let i = 0; i < chunks.length; i++) {
398
+ const chunkId = isChunked ? `chunk-${doc.id}-${i}` : doc.id;
399
+ const embedding = await this.generateEmbedding(chunks[i]);
400
+ const storedDoc = {
401
+ id: chunkId,
402
+ content: chunks[i],
403
+ metadata: {
404
+ type: doc.metadata?.type || "content",
405
+ ...doc.metadata
406
+ },
407
+ tenantId: this.config.tenantId,
408
+ agentId,
409
+ embedding
362
410
  };
411
+ if (isChunked) {
412
+ storedDoc.documentId = doc.id;
413
+ storedDoc.chunkIndex = i;
414
+ }
363
415
  await collection.updateOne(
364
- filter,
416
+ { tenantId: this.config.tenantId, id: chunkId, agentId },
365
417
  {
366
- $set: { ...doc, updatedAt: /* @__PURE__ */ new Date() },
418
+ $set: { ...storedDoc, updatedAt: /* @__PURE__ */ new Date() },
367
419
  $setOnInsert: { createdAt: /* @__PURE__ */ new Date() }
368
420
  },
369
421
  { upsert: true }
370
422
  );
371
- indexed++;
372
- } catch (error) {
373
- errors.push({
374
- id: doc.id,
375
- error: error instanceof Error ? error.message : "Unknown error"
376
- });
377
423
  }
424
+ indexed++;
425
+ } catch (error) {
426
+ errors.push({
427
+ id: doc.id,
428
+ error: error instanceof Error ? error.message : "Unknown error"
429
+ });
378
430
  }
379
431
  }
380
432
  return {
@@ -389,40 +441,51 @@ var WebRAGPlugin = class _WebRAGPlugin {
389
441
  };
390
442
  }
391
443
  /**
392
- * Update a single document
444
+ * Update a single document.
445
+ * When content changes the document is re-chunked (old chunks removed, new ones inserted).
393
446
  */
394
447
  async update(id, document, options) {
395
- const collection = await this.getCollection();
396
- const update = { updatedAt: /* @__PURE__ */ new Date() };
448
+ const agentId = options?.agentId || "shared";
397
449
  if (document.content) {
398
- const embedding = await this.generateEmbedding(document.content);
399
- update.content = document.content;
400
- update.embedding = embedding;
450
+ const fullDoc = {
451
+ id,
452
+ content: document.content,
453
+ metadata: document.metadata ?? { type: "content" }
454
+ };
455
+ await this.delete(id, options);
456
+ await this.ingest([fullDoc], options);
457
+ return;
401
458
  }
459
+ const collection = await this.getCollection();
460
+ const metaUpdate = { updatedAt: /* @__PURE__ */ new Date() };
402
461
  if (document.metadata) {
403
462
  for (const [key, value] of Object.entries(document.metadata)) {
404
- update[`metadata.${key}`] = value;
463
+ metaUpdate[`metadata.${key}`] = value;
405
464
  }
406
465
  }
407
- const filter = {
408
- tenantId: this.config.tenantId,
409
- id,
410
- // Match by agentId ('shared' for tenant-wide, specific for agent-only)
411
- agentId: options?.agentId || "shared"
412
- };
413
- await collection.updateOne(filter, { $set: update });
466
+ await collection.updateMany(
467
+ {
468
+ tenantId: this.config.tenantId,
469
+ agentId,
470
+ $or: [{ id }, { documentId: id }]
471
+ },
472
+ { $set: metaUpdate }
473
+ );
414
474
  }
415
475
  /**
416
- * Delete document(s) by ID
476
+ * Delete document(s) by ID — also removes any chunks belonging to the document.
417
477
  */
418
478
  async delete(ids, options) {
419
479
  const collection = await this.getCollection();
420
480
  const idArray = Array.isArray(ids) ? ids : [ids];
421
481
  const filter = {
422
482
  tenantId: this.config.tenantId,
423
- id: { $in: idArray },
424
- // Match by agentId ('shared' for tenant-wide, specific for agent-only)
425
- agentId: options?.agentId || "shared"
483
+ agentId: options?.agentId || "shared",
484
+ // Match the document itself (id) OR any chunks that belong to it (documentId)
485
+ $or: [
486
+ { id: { $in: idArray } },
487
+ { documentId: { $in: idArray } }
488
+ ]
426
489
  };
427
490
  const result = await collection.deleteMany(filter);
428
491
  return result.deletedCount;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@snap-agent/rag-web",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",