@snap-agent/rag-web 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +13 -2
- package/dist/index.d.ts +13 -2
- package/dist/index.js +123 -50
- package/dist/index.mjs +123 -50
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -26,6 +26,8 @@ interface WebDocument {
|
|
|
26
26
|
interface StoredWebDocument extends WebDocument {
|
|
27
27
|
tenantId: string;
|
|
28
28
|
agentId?: string;
|
|
29
|
+
documentId?: string;
|
|
30
|
+
chunkIndex?: number;
|
|
29
31
|
embedding: number[];
|
|
30
32
|
createdAt: Date;
|
|
31
33
|
updatedAt?: Date;
|
|
@@ -52,6 +54,8 @@ interface WebRAGConfig {
|
|
|
52
54
|
decayDays: number;
|
|
53
55
|
maxBoost?: number;
|
|
54
56
|
};
|
|
57
|
+
maxChunkSize?: number;
|
|
58
|
+
chunkOverlap?: number;
|
|
55
59
|
cache?: {
|
|
56
60
|
embeddings?: {
|
|
57
61
|
enabled: boolean;
|
|
@@ -456,16 +460,23 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
456
460
|
private vectorSearch;
|
|
457
461
|
private generateEmbedding;
|
|
458
462
|
private generateEmbeddingsBatch;
|
|
463
|
+
/**
|
|
464
|
+
* Split content into chunks by paragraph boundaries, respecting maxChunkSize.
|
|
465
|
+
* Returns the original content as a single chunk when chunking is disabled
|
|
466
|
+
* (maxChunkSize === 0) or the content fits within maxChunkSize.
|
|
467
|
+
*/
|
|
468
|
+
private chunkContent;
|
|
459
469
|
/**
|
|
460
470
|
* Ingest documents into the CMS RAG system
|
|
461
471
|
*/
|
|
462
472
|
ingest(documents: RAGDocument[], options?: IngestOptions): Promise<IngestResult>;
|
|
463
473
|
/**
|
|
464
|
-
* Update a single document
|
|
474
|
+
* Update a single document.
|
|
475
|
+
* When content changes the document is re-chunked (old chunks removed, new ones inserted).
|
|
465
476
|
*/
|
|
466
477
|
update(id: string, document: Partial<RAGDocument>, options?: IngestOptions): Promise<void>;
|
|
467
478
|
/**
|
|
468
|
-
* Delete document(s) by ID
|
|
479
|
+
* Delete document(s) by ID — also removes any chunks belonging to the document.
|
|
469
480
|
*/
|
|
470
481
|
delete(ids: string | string[], options?: IngestOptions): Promise<number>;
|
|
471
482
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -26,6 +26,8 @@ interface WebDocument {
|
|
|
26
26
|
interface StoredWebDocument extends WebDocument {
|
|
27
27
|
tenantId: string;
|
|
28
28
|
agentId?: string;
|
|
29
|
+
documentId?: string;
|
|
30
|
+
chunkIndex?: number;
|
|
29
31
|
embedding: number[];
|
|
30
32
|
createdAt: Date;
|
|
31
33
|
updatedAt?: Date;
|
|
@@ -52,6 +54,8 @@ interface WebRAGConfig {
|
|
|
52
54
|
decayDays: number;
|
|
53
55
|
maxBoost?: number;
|
|
54
56
|
};
|
|
57
|
+
maxChunkSize?: number;
|
|
58
|
+
chunkOverlap?: number;
|
|
55
59
|
cache?: {
|
|
56
60
|
embeddings?: {
|
|
57
61
|
enabled: boolean;
|
|
@@ -456,16 +460,23 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
456
460
|
private vectorSearch;
|
|
457
461
|
private generateEmbedding;
|
|
458
462
|
private generateEmbeddingsBatch;
|
|
463
|
+
/**
|
|
464
|
+
* Split content into chunks by paragraph boundaries, respecting maxChunkSize.
|
|
465
|
+
* Returns the original content as a single chunk when chunking is disabled
|
|
466
|
+
* (maxChunkSize === 0) or the content fits within maxChunkSize.
|
|
467
|
+
*/
|
|
468
|
+
private chunkContent;
|
|
459
469
|
/**
|
|
460
470
|
* Ingest documents into the CMS RAG system
|
|
461
471
|
*/
|
|
462
472
|
ingest(documents: RAGDocument[], options?: IngestOptions): Promise<IngestResult>;
|
|
463
473
|
/**
|
|
464
|
-
* Update a single document
|
|
474
|
+
* Update a single document.
|
|
475
|
+
* When content changes the document is re-chunked (old chunks removed, new ones inserted).
|
|
465
476
|
*/
|
|
466
477
|
update(id: string, document: Partial<RAGDocument>, options?: IngestOptions): Promise<void>;
|
|
467
478
|
/**
|
|
468
|
-
* Delete document(s) by ID
|
|
479
|
+
* Delete document(s) by ID — also removes any chunks belonging to the document.
|
|
469
480
|
*/
|
|
470
481
|
delete(ids: string | string[], options?: IngestOptions): Promise<number>;
|
|
471
482
|
/**
|
package/dist/index.js
CHANGED
|
@@ -60,6 +60,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
60
60
|
limit: 10,
|
|
61
61
|
minScore: 0.7,
|
|
62
62
|
filterableFields: ["type"],
|
|
63
|
+
maxChunkSize: 1500,
|
|
64
|
+
chunkOverlap: 200,
|
|
63
65
|
...config
|
|
64
66
|
};
|
|
65
67
|
this.priority = config.priority ?? 100;
|
|
@@ -361,6 +363,52 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
361
363
|
return embeddings;
|
|
362
364
|
}
|
|
363
365
|
// ============================================================================
|
|
366
|
+
// Chunking
|
|
367
|
+
// ============================================================================
|
|
368
|
+
/**
|
|
369
|
+
* Split content into chunks by paragraph boundaries, respecting maxChunkSize.
|
|
370
|
+
* Returns the original content as a single chunk when chunking is disabled
|
|
371
|
+
* (maxChunkSize === 0) or the content fits within maxChunkSize.
|
|
372
|
+
*/
|
|
373
|
+
chunkContent(content) {
|
|
374
|
+
const maxSize = this.config.maxChunkSize ?? 1500;
|
|
375
|
+
if (maxSize === 0 || content.length <= maxSize) {
|
|
376
|
+
return [content];
|
|
377
|
+
}
|
|
378
|
+
const overlap = this.config.chunkOverlap ?? 200;
|
|
379
|
+
const paragraphs = content.split(/\n\n+/);
|
|
380
|
+
const chunks = [];
|
|
381
|
+
let current = "";
|
|
382
|
+
for (const para of paragraphs) {
|
|
383
|
+
const trimmed = para.trim();
|
|
384
|
+
if (!trimmed) continue;
|
|
385
|
+
if (trimmed.length > maxSize) {
|
|
386
|
+
if (current.trim()) {
|
|
387
|
+
chunks.push(current.trim());
|
|
388
|
+
current = "";
|
|
389
|
+
}
|
|
390
|
+
for (let i = 0; i < trimmed.length; i += maxSize - overlap) {
|
|
391
|
+
const slice = trimmed.slice(i, i + maxSize);
|
|
392
|
+
if (slice.trim()) chunks.push(slice.trim());
|
|
393
|
+
}
|
|
394
|
+
continue;
|
|
395
|
+
}
|
|
396
|
+
const candidate = current ? current + "\n\n" + trimmed : trimmed;
|
|
397
|
+
if (candidate.length > maxSize) {
|
|
398
|
+
if (current.trim()) {
|
|
399
|
+
chunks.push(current.trim());
|
|
400
|
+
}
|
|
401
|
+
current = trimmed;
|
|
402
|
+
} else {
|
|
403
|
+
current = candidate;
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
if (current.trim()) {
|
|
407
|
+
chunks.push(current.trim());
|
|
408
|
+
}
|
|
409
|
+
return chunks.length > 0 ? chunks : [content];
|
|
410
|
+
}
|
|
411
|
+
// ============================================================================
|
|
364
412
|
// Document Ingestion
|
|
365
413
|
// ============================================================================
|
|
366
414
|
/**
|
|
@@ -370,47 +418,51 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
370
418
|
const collection = await this.getCollection();
|
|
371
419
|
let indexed = 0;
|
|
372
420
|
const errors = [];
|
|
373
|
-
const
|
|
374
|
-
for (
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
421
|
+
const agentId = options?.agentId || "shared";
|
|
422
|
+
for (const doc of documents) {
|
|
423
|
+
try {
|
|
424
|
+
const chunks = this.chunkContent(doc.content);
|
|
425
|
+
const isChunked = chunks.length > 1;
|
|
426
|
+
if (isChunked) {
|
|
427
|
+
await collection.deleteMany({
|
|
428
|
+
tenantId: this.config.tenantId,
|
|
429
|
+
documentId: doc.id,
|
|
430
|
+
agentId
|
|
431
|
+
});
|
|
432
|
+
}
|
|
433
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
434
|
+
const chunkId = isChunked ? `chunk-${doc.id}-${i}` : doc.id;
|
|
435
|
+
const embedding = await this.generateEmbedding(chunks[i]);
|
|
436
|
+
const storedDoc = {
|
|
437
|
+
id: chunkId,
|
|
438
|
+
content: chunks[i],
|
|
439
|
+
metadata: {
|
|
440
|
+
type: doc.metadata?.type || "content",
|
|
441
|
+
...doc.metadata
|
|
442
|
+
},
|
|
394
443
|
tenantId: this.config.tenantId,
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
agentId: options?.agentId || "shared"
|
|
444
|
+
agentId,
|
|
445
|
+
embedding
|
|
398
446
|
};
|
|
447
|
+
if (isChunked) {
|
|
448
|
+
storedDoc.documentId = doc.id;
|
|
449
|
+
storedDoc.chunkIndex = i;
|
|
450
|
+
}
|
|
399
451
|
await collection.updateOne(
|
|
400
|
-
|
|
452
|
+
{ tenantId: this.config.tenantId, id: chunkId, agentId },
|
|
401
453
|
{
|
|
402
|
-
$set: { ...
|
|
454
|
+
$set: { ...storedDoc, updatedAt: /* @__PURE__ */ new Date() },
|
|
403
455
|
$setOnInsert: { createdAt: /* @__PURE__ */ new Date() }
|
|
404
456
|
},
|
|
405
457
|
{ upsert: true }
|
|
406
458
|
);
|
|
407
|
-
indexed++;
|
|
408
|
-
} catch (error) {
|
|
409
|
-
errors.push({
|
|
410
|
-
id: doc.id,
|
|
411
|
-
error: error instanceof Error ? error.message : "Unknown error"
|
|
412
|
-
});
|
|
413
459
|
}
|
|
460
|
+
indexed++;
|
|
461
|
+
} catch (error) {
|
|
462
|
+
errors.push({
|
|
463
|
+
id: doc.id,
|
|
464
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
465
|
+
});
|
|
414
466
|
}
|
|
415
467
|
}
|
|
416
468
|
return {
|
|
@@ -425,40 +477,51 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
425
477
|
};
|
|
426
478
|
}
|
|
427
479
|
/**
|
|
428
|
-
* Update a single document
|
|
480
|
+
* Update a single document.
|
|
481
|
+
* When content changes the document is re-chunked (old chunks removed, new ones inserted).
|
|
429
482
|
*/
|
|
430
483
|
async update(id, document, options) {
|
|
431
|
-
const
|
|
432
|
-
const update = { updatedAt: /* @__PURE__ */ new Date() };
|
|
484
|
+
const agentId = options?.agentId || "shared";
|
|
433
485
|
if (document.content) {
|
|
434
|
-
const
|
|
435
|
-
|
|
436
|
-
|
|
486
|
+
const fullDoc = {
|
|
487
|
+
id,
|
|
488
|
+
content: document.content,
|
|
489
|
+
metadata: document.metadata ?? { type: "content" }
|
|
490
|
+
};
|
|
491
|
+
await this.delete(id, options);
|
|
492
|
+
await this.ingest([fullDoc], options);
|
|
493
|
+
return;
|
|
437
494
|
}
|
|
495
|
+
const collection = await this.getCollection();
|
|
496
|
+
const metaUpdate = { updatedAt: /* @__PURE__ */ new Date() };
|
|
438
497
|
if (document.metadata) {
|
|
439
498
|
for (const [key, value] of Object.entries(document.metadata)) {
|
|
440
|
-
|
|
499
|
+
metaUpdate[`metadata.${key}`] = value;
|
|
441
500
|
}
|
|
442
501
|
}
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
502
|
+
await collection.updateMany(
|
|
503
|
+
{
|
|
504
|
+
tenantId: this.config.tenantId,
|
|
505
|
+
agentId,
|
|
506
|
+
$or: [{ id }, { documentId: id }]
|
|
507
|
+
},
|
|
508
|
+
{ $set: metaUpdate }
|
|
509
|
+
);
|
|
450
510
|
}
|
|
451
511
|
/**
|
|
452
|
-
* Delete document(s) by ID
|
|
512
|
+
* Delete document(s) by ID — also removes any chunks belonging to the document.
|
|
453
513
|
*/
|
|
454
514
|
async delete(ids, options) {
|
|
455
515
|
const collection = await this.getCollection();
|
|
456
516
|
const idArray = Array.isArray(ids) ? ids : [ids];
|
|
457
517
|
const filter = {
|
|
458
518
|
tenantId: this.config.tenantId,
|
|
459
|
-
|
|
460
|
-
// Match
|
|
461
|
-
|
|
519
|
+
agentId: options?.agentId || "shared",
|
|
520
|
+
// Match the document itself (id) OR any chunks that belong to it (documentId)
|
|
521
|
+
$or: [
|
|
522
|
+
{ id: { $in: idArray } },
|
|
523
|
+
{ documentId: { $in: idArray } }
|
|
524
|
+
]
|
|
462
525
|
};
|
|
463
526
|
const result = await collection.deleteMany(filter);
|
|
464
527
|
return result.deletedCount;
|
|
@@ -1596,6 +1659,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1596
1659
|
const content = this.extractBestContentText($, config);
|
|
1597
1660
|
const minChars = config.minExtractedContentLength ?? 50;
|
|
1598
1661
|
if (!content || content.length < minChars) return null;
|
|
1662
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || void 0;
|
|
1663
|
+
let imageUrl;
|
|
1664
|
+
if (image) {
|
|
1665
|
+
try {
|
|
1666
|
+
imageUrl = new URL(image, url).href;
|
|
1667
|
+
} catch {
|
|
1668
|
+
imageUrl = image;
|
|
1669
|
+
}
|
|
1670
|
+
}
|
|
1599
1671
|
let type = config.defaultType || "page";
|
|
1600
1672
|
if (config.typeFromUrl) {
|
|
1601
1673
|
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
@@ -1613,6 +1685,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1613
1685
|
type,
|
|
1614
1686
|
title,
|
|
1615
1687
|
url,
|
|
1688
|
+
...imageUrl ? { imageUrl } : {},
|
|
1616
1689
|
...config.metadata
|
|
1617
1690
|
}
|
|
1618
1691
|
};
|
package/dist/index.mjs
CHANGED
|
@@ -24,6 +24,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
24
24
|
limit: 10,
|
|
25
25
|
minScore: 0.7,
|
|
26
26
|
filterableFields: ["type"],
|
|
27
|
+
maxChunkSize: 1500,
|
|
28
|
+
chunkOverlap: 200,
|
|
27
29
|
...config
|
|
28
30
|
};
|
|
29
31
|
this.priority = config.priority ?? 100;
|
|
@@ -325,6 +327,52 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
325
327
|
return embeddings;
|
|
326
328
|
}
|
|
327
329
|
// ============================================================================
|
|
330
|
+
// Chunking
|
|
331
|
+
// ============================================================================
|
|
332
|
+
/**
|
|
333
|
+
* Split content into chunks by paragraph boundaries, respecting maxChunkSize.
|
|
334
|
+
* Returns the original content as a single chunk when chunking is disabled
|
|
335
|
+
* (maxChunkSize === 0) or the content fits within maxChunkSize.
|
|
336
|
+
*/
|
|
337
|
+
chunkContent(content) {
|
|
338
|
+
const maxSize = this.config.maxChunkSize ?? 1500;
|
|
339
|
+
if (maxSize === 0 || content.length <= maxSize) {
|
|
340
|
+
return [content];
|
|
341
|
+
}
|
|
342
|
+
const overlap = this.config.chunkOverlap ?? 200;
|
|
343
|
+
const paragraphs = content.split(/\n\n+/);
|
|
344
|
+
const chunks = [];
|
|
345
|
+
let current = "";
|
|
346
|
+
for (const para of paragraphs) {
|
|
347
|
+
const trimmed = para.trim();
|
|
348
|
+
if (!trimmed) continue;
|
|
349
|
+
if (trimmed.length > maxSize) {
|
|
350
|
+
if (current.trim()) {
|
|
351
|
+
chunks.push(current.trim());
|
|
352
|
+
current = "";
|
|
353
|
+
}
|
|
354
|
+
for (let i = 0; i < trimmed.length; i += maxSize - overlap) {
|
|
355
|
+
const slice = trimmed.slice(i, i + maxSize);
|
|
356
|
+
if (slice.trim()) chunks.push(slice.trim());
|
|
357
|
+
}
|
|
358
|
+
continue;
|
|
359
|
+
}
|
|
360
|
+
const candidate = current ? current + "\n\n" + trimmed : trimmed;
|
|
361
|
+
if (candidate.length > maxSize) {
|
|
362
|
+
if (current.trim()) {
|
|
363
|
+
chunks.push(current.trim());
|
|
364
|
+
}
|
|
365
|
+
current = trimmed;
|
|
366
|
+
} else {
|
|
367
|
+
current = candidate;
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
if (current.trim()) {
|
|
371
|
+
chunks.push(current.trim());
|
|
372
|
+
}
|
|
373
|
+
return chunks.length > 0 ? chunks : [content];
|
|
374
|
+
}
|
|
375
|
+
// ============================================================================
|
|
328
376
|
// Document Ingestion
|
|
329
377
|
// ============================================================================
|
|
330
378
|
/**
|
|
@@ -334,47 +382,51 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
334
382
|
const collection = await this.getCollection();
|
|
335
383
|
let indexed = 0;
|
|
336
384
|
const errors = [];
|
|
337
|
-
const
|
|
338
|
-
for (
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
385
|
+
const agentId = options?.agentId || "shared";
|
|
386
|
+
for (const doc of documents) {
|
|
387
|
+
try {
|
|
388
|
+
const chunks = this.chunkContent(doc.content);
|
|
389
|
+
const isChunked = chunks.length > 1;
|
|
390
|
+
if (isChunked) {
|
|
391
|
+
await collection.deleteMany({
|
|
392
|
+
tenantId: this.config.tenantId,
|
|
393
|
+
documentId: doc.id,
|
|
394
|
+
agentId
|
|
395
|
+
});
|
|
396
|
+
}
|
|
397
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
398
|
+
const chunkId = isChunked ? `chunk-${doc.id}-${i}` : doc.id;
|
|
399
|
+
const embedding = await this.generateEmbedding(chunks[i]);
|
|
400
|
+
const storedDoc = {
|
|
401
|
+
id: chunkId,
|
|
402
|
+
content: chunks[i],
|
|
403
|
+
metadata: {
|
|
404
|
+
type: doc.metadata?.type || "content",
|
|
405
|
+
...doc.metadata
|
|
406
|
+
},
|
|
358
407
|
tenantId: this.config.tenantId,
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
agentId: options?.agentId || "shared"
|
|
408
|
+
agentId,
|
|
409
|
+
embedding
|
|
362
410
|
};
|
|
411
|
+
if (isChunked) {
|
|
412
|
+
storedDoc.documentId = doc.id;
|
|
413
|
+
storedDoc.chunkIndex = i;
|
|
414
|
+
}
|
|
363
415
|
await collection.updateOne(
|
|
364
|
-
|
|
416
|
+
{ tenantId: this.config.tenantId, id: chunkId, agentId },
|
|
365
417
|
{
|
|
366
|
-
$set: { ...
|
|
418
|
+
$set: { ...storedDoc, updatedAt: /* @__PURE__ */ new Date() },
|
|
367
419
|
$setOnInsert: { createdAt: /* @__PURE__ */ new Date() }
|
|
368
420
|
},
|
|
369
421
|
{ upsert: true }
|
|
370
422
|
);
|
|
371
|
-
indexed++;
|
|
372
|
-
} catch (error) {
|
|
373
|
-
errors.push({
|
|
374
|
-
id: doc.id,
|
|
375
|
-
error: error instanceof Error ? error.message : "Unknown error"
|
|
376
|
-
});
|
|
377
423
|
}
|
|
424
|
+
indexed++;
|
|
425
|
+
} catch (error) {
|
|
426
|
+
errors.push({
|
|
427
|
+
id: doc.id,
|
|
428
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
429
|
+
});
|
|
378
430
|
}
|
|
379
431
|
}
|
|
380
432
|
return {
|
|
@@ -389,40 +441,51 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
389
441
|
};
|
|
390
442
|
}
|
|
391
443
|
/**
|
|
392
|
-
* Update a single document
|
|
444
|
+
* Update a single document.
|
|
445
|
+
* When content changes the document is re-chunked (old chunks removed, new ones inserted).
|
|
393
446
|
*/
|
|
394
447
|
async update(id, document, options) {
|
|
395
|
-
const
|
|
396
|
-
const update = { updatedAt: /* @__PURE__ */ new Date() };
|
|
448
|
+
const agentId = options?.agentId || "shared";
|
|
397
449
|
if (document.content) {
|
|
398
|
-
const
|
|
399
|
-
|
|
400
|
-
|
|
450
|
+
const fullDoc = {
|
|
451
|
+
id,
|
|
452
|
+
content: document.content,
|
|
453
|
+
metadata: document.metadata ?? { type: "content" }
|
|
454
|
+
};
|
|
455
|
+
await this.delete(id, options);
|
|
456
|
+
await this.ingest([fullDoc], options);
|
|
457
|
+
return;
|
|
401
458
|
}
|
|
459
|
+
const collection = await this.getCollection();
|
|
460
|
+
const metaUpdate = { updatedAt: /* @__PURE__ */ new Date() };
|
|
402
461
|
if (document.metadata) {
|
|
403
462
|
for (const [key, value] of Object.entries(document.metadata)) {
|
|
404
|
-
|
|
463
|
+
metaUpdate[`metadata.${key}`] = value;
|
|
405
464
|
}
|
|
406
465
|
}
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
466
|
+
await collection.updateMany(
|
|
467
|
+
{
|
|
468
|
+
tenantId: this.config.tenantId,
|
|
469
|
+
agentId,
|
|
470
|
+
$or: [{ id }, { documentId: id }]
|
|
471
|
+
},
|
|
472
|
+
{ $set: metaUpdate }
|
|
473
|
+
);
|
|
414
474
|
}
|
|
415
475
|
/**
|
|
416
|
-
* Delete document(s) by ID
|
|
476
|
+
* Delete document(s) by ID — also removes any chunks belonging to the document.
|
|
417
477
|
*/
|
|
418
478
|
async delete(ids, options) {
|
|
419
479
|
const collection = await this.getCollection();
|
|
420
480
|
const idArray = Array.isArray(ids) ? ids : [ids];
|
|
421
481
|
const filter = {
|
|
422
482
|
tenantId: this.config.tenantId,
|
|
423
|
-
|
|
424
|
-
// Match
|
|
425
|
-
|
|
483
|
+
agentId: options?.agentId || "shared",
|
|
484
|
+
// Match the document itself (id) OR any chunks that belong to it (documentId)
|
|
485
|
+
$or: [
|
|
486
|
+
{ id: { $in: idArray } },
|
|
487
|
+
{ documentId: { $in: idArray } }
|
|
488
|
+
]
|
|
426
489
|
};
|
|
427
490
|
const result = await collection.deleteMany(filter);
|
|
428
491
|
return result.deletedCount;
|
|
@@ -1560,6 +1623,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1560
1623
|
const content = this.extractBestContentText($, config);
|
|
1561
1624
|
const minChars = config.minExtractedContentLength ?? 50;
|
|
1562
1625
|
if (!content || content.length < minChars) return null;
|
|
1626
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || void 0;
|
|
1627
|
+
let imageUrl;
|
|
1628
|
+
if (image) {
|
|
1629
|
+
try {
|
|
1630
|
+
imageUrl = new URL(image, url).href;
|
|
1631
|
+
} catch {
|
|
1632
|
+
imageUrl = image;
|
|
1633
|
+
}
|
|
1634
|
+
}
|
|
1563
1635
|
let type = config.defaultType || "page";
|
|
1564
1636
|
if (config.typeFromUrl) {
|
|
1565
1637
|
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
@@ -1577,6 +1649,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1577
1649
|
type,
|
|
1578
1650
|
title,
|
|
1579
1651
|
url,
|
|
1652
|
+
...imageUrl ? { imageUrl } : {},
|
|
1580
1653
|
...config.metadata
|
|
1581
1654
|
}
|
|
1582
1655
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.2",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|