@snap-agent/rag-web 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +571 -0
- package/dist/index.d.mts +727 -0
- package/dist/index.d.ts +727 -0
- package/dist/index.js +2144 -0
- package/dist/index.mjs +2107 -0
- package/package.json +71 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 ViloTech
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
package/README.md
ADDED
|
@@ -0,0 +1,571 @@
|
|
|
1
|
+
# @snap-agent/rag-web
|
|
2
|
+
|
|
3
|
+
Schema-agnostic Web RAG plugin for the SnapAgent SDK. Build chatbots for any website content — Drupal, WordPress, Contentful, or a custom CMS.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Schema-Agnostic** - Only `id`, `content`, and `metadata.type` are required; store any other metadata you need
|
|
8
|
+
- **Multi-CMS Support** - Works with Drupal JSON:API, WordPress REST, Contentful, or any JSON/CSV/XML source
|
|
9
|
+
- **Flexible Filtering** - Filter by any metadata field (type, category, author, tags, etc.)
|
|
10
|
+
- **Type Boosts** - Prioritize certain content types in search results
|
|
11
|
+
- **Recency Boost** - Automatically boost fresh content (great for news/blog)
|
|
12
|
+
- **URL Ingestion** - Fetch content directly from APIs with authentication
|
|
13
|
+
- **CMS Integrations** - Built-in helpers for Drupal JSON:API, WordPress REST, Sanity.io (GROQ), and Strapi (v3 & v4)
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
npm install @snap-agent/rag-web @snap-agent/core mongodb
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```typescript
|
|
24
|
+
import { createClient, MemoryStorage } from '@snap-agent/core';
|
|
25
|
+
import { WebRAGPlugin } from '@snap-agent/rag-web';
|
|
26
|
+
|
|
27
|
+
const cmsPlugin = new WebRAGPlugin({
|
|
28
|
+
mongoUri: process.env.MONGODB_URI!,
|
|
29
|
+
dbName: 'my_website',
|
|
30
|
+
openaiApiKey: process.env.OPENAI_API_KEY!,
|
|
31
|
+
tenantId: 'my-company',
|
|
32
|
+
filterableFields: ['type', 'category', 'author'],
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
const client = createClient({
|
|
36
|
+
storage: new MemoryStorage(),
|
|
37
|
+
providers: {
|
|
38
|
+
openai: { apiKey: process.env.OPENAI_API_KEY! },
|
|
39
|
+
},
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
const agent = await client.createAgent({
|
|
43
|
+
name: 'Website Assistant',
|
|
44
|
+
instructions: 'Help visitors find information on our website.',
|
|
45
|
+
model: 'gpt-4o',
|
|
46
|
+
userId: 'system',
|
|
47
|
+
plugins: [cmsPlugin],
|
|
48
|
+
});
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Content Structure
|
|
52
|
+
|
|
53
|
+
Only three fields are required — `id`, `content`, and `metadata.type`; every other metadata field is optional:
|
|
54
|
+
|
|
55
|
+
```typescript
|
|
56
|
+
interface WebDocument {
|
|
57
|
+
id: string; // Unique identifier
|
|
58
|
+
content: string; // Text to embed and search
|
|
59
|
+
metadata: {
|
|
60
|
+
type: string; // Content type (e.g., 'blog', 'project', 'team')
|
|
61
|
+
title?: string; // Optional: Display title
|
|
62
|
+
url?: string; // Optional: Source URL
|
|
63
|
+
[key: string]: any; // Any other fields you need!
|
|
64
|
+
};
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Example: Architecture Firm Website
|
|
69
|
+
|
|
70
|
+
```typescript
|
|
71
|
+
// Projects
|
|
72
|
+
await agent.ingestDocuments([{
|
|
73
|
+
id: 'project-123',
|
|
74
|
+
content: 'The Sahara West Library is a 65,000 SF public library featuring sustainable design...',
|
|
75
|
+
metadata: {
|
|
76
|
+
type: 'project',
|
|
77
|
+
title: 'Sahara West Library',
|
|
78
|
+
url: '/projects/sahara-west-library',
|
|
79
|
+
location: 'Las Vegas, NV',
|
|
80
|
+
sector: 'Cultural',
|
|
81
|
+
services: ['Architecture', 'Interior Design'],
|
|
82
|
+
completionYear: 2018,
|
|
83
|
+
featured: true,
|
|
84
|
+
}
|
|
85
|
+
}]);
|
|
86
|
+
|
|
87
|
+
// Team Members
|
|
88
|
+
await agent.ingestDocuments([{
|
|
89
|
+
id: 'team-456',
|
|
90
|
+
content: 'Jane Smith is a Principal and leads the Healthcare practice...',
|
|
91
|
+
metadata: {
|
|
92
|
+
type: 'team',
|
|
93
|
+
title: 'Jane Smith',
|
|
94
|
+
url: '/people/jane-smith',
|
|
95
|
+
role: 'Principal',
|
|
96
|
+
location: 'Phoenix',
|
|
97
|
+
sectors: ['Healthcare', 'Science & Technology'],
|
|
98
|
+
}
|
|
99
|
+
}]);
|
|
100
|
+
|
|
101
|
+
// News/Perspectives
|
|
102
|
+
await agent.ingestDocuments([{
|
|
103
|
+
id: 'perspective-789',
|
|
104
|
+
content: 'Biophilic design connects building occupants to nature...',
|
|
105
|
+
metadata: {
|
|
106
|
+
type: 'perspective',
|
|
107
|
+
title: 'The Science of Biophilic Design',
|
|
108
|
+
url: '/perspectives/biophilic-design',
|
|
109
|
+
author: 'Jane Smith',
|
|
110
|
+
publishedAt: '2024-01-15',
|
|
111
|
+
tags: ['Sustainability', 'Wellness'],
|
|
112
|
+
}
|
|
113
|
+
}]);
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## CMS Integrations
|
|
117
|
+
|
|
118
|
+
Built-in helpers for popular CMS platforms:
|
|
119
|
+
|
|
120
|
+
### Drupal (JSON:API)
|
|
121
|
+
|
|
122
|
+
```typescript
|
|
123
|
+
await cmsPlugin.ingestFromDrupal({
|
|
124
|
+
baseUrl: 'https://example-architecture.com',
|
|
125
|
+
contentTypes: ['project', 'perspective', 'team_member', 'news'],
|
|
126
|
+
auth: {
|
|
127
|
+
type: 'bearer',
|
|
128
|
+
token: process.env.DRUPAL_API_TOKEN,
|
|
129
|
+
},
|
|
130
|
+
mappings: {
|
|
131
|
+
project: {
|
|
132
|
+
content: 'attributes.body.processed',
|
|
133
|
+
fields: {
|
|
134
|
+
location: 'attributes.field_location',
|
|
135
|
+
sector: 'attributes.field_sector.name',
|
|
136
|
+
services: 'attributes.field_services',
|
|
137
|
+
},
|
|
138
|
+
},
|
|
139
|
+
team_member: {
|
|
140
|
+
content: 'attributes.field_bio.processed',
|
|
141
|
+
fields: {
|
|
142
|
+
role: 'attributes.field_title',
|
|
143
|
+
sectors: 'attributes.field_sectors',
|
|
144
|
+
},
|
|
145
|
+
},
|
|
146
|
+
},
|
|
147
|
+
});
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### WordPress (REST API)
|
|
151
|
+
|
|
152
|
+
```typescript
|
|
153
|
+
await cmsPlugin.ingestFromWordPress({
|
|
154
|
+
baseUrl: 'https://myblog.com',
|
|
155
|
+
postTypes: ['posts', 'pages', 'portfolio'], // Default: ['posts', 'pages']
|
|
156
|
+
perPage: 100, // Items per request
|
|
157
|
+
maxPages: 10, // Max pages to fetch
|
|
158
|
+
auth: {
|
|
159
|
+
type: 'basic',
|
|
160
|
+
username: process.env.WP_USER,
|
|
161
|
+
password: process.env.WP_APP_PASSWORD,
|
|
162
|
+
},
|
|
163
|
+
mappings: {
|
|
164
|
+
portfolio: {
|
|
165
|
+
content: 'content.rendered',
|
|
166
|
+
fields: {
|
|
167
|
+
client: 'acf.client_name', // ACF custom fields
|
|
168
|
+
industry: 'acf.industry',
|
|
169
|
+
},
|
|
170
|
+
},
|
|
171
|
+
},
|
|
172
|
+
});
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Features:**
|
|
176
|
+
- Automatic pagination handling
|
|
177
|
+
- Embedded data (`_embed`) for authors, categories, featured images
|
|
178
|
+
- ACF (Advanced Custom Fields) support via custom mappings
|
|
179
|
+
- Custom post types
|
|
180
|
+
|
|
181
|
+
### Sanity.io (GROQ)
|
|
182
|
+
|
|
183
|
+
```typescript
|
|
184
|
+
await cmsPlugin.ingestFromSanity({
|
|
185
|
+
projectId: 'abc123',
|
|
186
|
+
dataset: 'production',
|
|
187
|
+
apiVersion: 'v2024-01-01', // Optional
|
|
188
|
+
token: process.env.SANITY_TOKEN, // For private datasets
|
|
189
|
+
useCdn: true, // Default: true (faster reads)
|
|
190
|
+
queries: {
|
|
191
|
+
post: {
|
|
192
|
+
query: '*[_type == "post" && !(_id in path("drafts.**"))]',
|
|
193
|
+
content: 'body',
|
|
194
|
+
fields: {
|
|
195
|
+
author: 'author->name',
|
|
196
|
+
categories: 'categories[]->title',
|
|
197
|
+
mainImage: 'mainImage.asset->url',
|
|
198
|
+
},
|
|
199
|
+
},
|
|
200
|
+
page: {
|
|
201
|
+
query: '*[_type == "page"]',
|
|
202
|
+
content: 'content',
|
|
203
|
+
},
|
|
204
|
+
},
|
|
205
|
+
});
|
|
206
|
+
|
|
207
|
+
// Convert Portable Text to plain text
|
|
208
|
+
const plainText = WebRAGPlugin.sanityBlocksToText(portableTextBlocks);
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
**Features:**
|
|
212
|
+
- GROQ query support for complex filtering
|
|
213
|
+
- Reference expansion (`->` operator)
|
|
214
|
+
- Portable Text to plain text conversion
|
|
215
|
+
- CDN and API endpoint support
|
|
216
|
+
|
|
217
|
+
### Strapi (v3 & v4)
|
|
218
|
+
|
|
219
|
+
```typescript
|
|
220
|
+
// Strapi v4 (default)
|
|
221
|
+
await cmsPlugin.ingestFromStrapi({
|
|
222
|
+
baseUrl: 'https://my-strapi.com',
|
|
223
|
+
apiToken: process.env.STRAPI_TOKEN,
|
|
224
|
+
contentTypes: ['articles', 'pages', 'projects'],
|
|
225
|
+
pageSize: 100,
|
|
226
|
+
maxPages: 10,
|
|
227
|
+
mappings: {
|
|
228
|
+
articles: {
|
|
229
|
+
content: 'attributes.content',
|
|
230
|
+
fields: {
|
|
231
|
+
author: 'attributes.author.data.attributes.name',
|
|
232
|
+
category: 'attributes.category.data.attributes.name',
|
|
233
|
+
featuredImage: 'attributes.cover.data.attributes.url',
|
|
234
|
+
},
|
|
235
|
+
},
|
|
236
|
+
},
|
|
237
|
+
});
|
|
238
|
+
|
|
239
|
+
// Strapi v3 (set useAttributes: false)
|
|
240
|
+
await cmsPlugin.ingestFromStrapi({
|
|
241
|
+
baseUrl: 'https://my-strapi-v3.com',
|
|
242
|
+
contentTypes: ['articles'],
|
|
243
|
+
mappings: {
|
|
244
|
+
articles: {
|
|
245
|
+
content: 'content',
|
|
246
|
+
useAttributes: false, // Strapi v3 uses flat structure
|
|
247
|
+
fields: {
|
|
248
|
+
author: 'author.name',
|
|
249
|
+
},
|
|
250
|
+
},
|
|
251
|
+
},
|
|
252
|
+
});
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
**Features:**
|
|
256
|
+
- Strapi v3 and v4 support
|
|
257
|
+
- Automatic pagination
|
|
258
|
+
- Relation population (`populate=*`)
|
|
259
|
+
- Media/image URL extraction
|
|
260
|
+
|
|
261
|
+
## Zero-Setup Web Crawling
|
|
262
|
+
|
|
263
|
+
For non-technical clients who can't set up API access, use the built-in web crawler:
|
|
264
|
+
|
|
265
|
+
### Sitemap Crawling
|
|
266
|
+
|
|
267
|
+
Just provide the sitemap URL — works with any website:
|
|
268
|
+
|
|
269
|
+
```typescript
|
|
270
|
+
// Simple - just the sitemap URL
|
|
271
|
+
await cmsPlugin.ingestFromSitemap({
|
|
272
|
+
sitemapUrl: 'https://example.com/sitemap.xml',
|
|
273
|
+
});
|
|
274
|
+
|
|
275
|
+
// Or auto-discover sitemap from base URL
|
|
276
|
+
await cmsPlugin.ingestFromSitemap({
|
|
277
|
+
baseUrl: 'https://example.com',
|
|
278
|
+
});
|
|
279
|
+
|
|
280
|
+
// Advanced - with content selectors and type inference
|
|
281
|
+
await cmsPlugin.ingestFromSitemap({
|
|
282
|
+
sitemapUrl: 'https://example.com/sitemap.xml',
|
|
283
|
+
maxPages: 500,
|
|
284
|
+
concurrency: 3, // Parallel requests
|
|
285
|
+
delayMs: 500, // Delay between requests
|
|
286
|
+
|
|
287
|
+
// Content extraction
|
|
288
|
+
contentSelector: 'article, .main-content, main',
|
|
289
|
+
removeSelectors: ['nav', 'footer', '.sidebar', '.comments'],
|
|
290
|
+
|
|
291
|
+
// URL filtering
|
|
292
|
+
excludePatterns: ['/cart', '/checkout', '/admin', '/login'],
|
|
293
|
+
includePatterns: ['/blog/', '/projects/', '/people/'],
|
|
294
|
+
|
|
295
|
+
// Infer type from URL path
|
|
296
|
+
typeFromUrl: {
|
|
297
|
+
'/projects/': 'project',
|
|
298
|
+
'/perspectives/': 'blog',
|
|
299
|
+
'/people/': 'team',
|
|
300
|
+
'/news/': 'news',
|
|
301
|
+
},
|
|
302
|
+
});
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
**Features:**
|
|
306
|
+
- Auto-discovers sitemap from base URL
|
|
307
|
+
- Handles sitemap index files (nested sitemaps)
|
|
308
|
+
- Smart content extraction using CSS selectors
|
|
309
|
+
- URL pattern filtering (include/exclude)
|
|
310
|
+
- Content type inference from URL
|
|
311
|
+
- Rate limiting (concurrency + delay)
|
|
312
|
+
- Removes navigation, footers, sidebars
|
|
313
|
+
|
|
314
|
+
### Direct URL Crawling
|
|
315
|
+
|
|
316
|
+
Crawl specific pages:
|
|
317
|
+
|
|
318
|
+
```typescript
|
|
319
|
+
await cmsPlugin.ingestFromUrls([
|
|
320
|
+
'https://example.com/about',
|
|
321
|
+
'https://example.com/services',
|
|
322
|
+
'https://example.com/contact',
|
|
323
|
+
'https://example.com/pricing',
|
|
324
|
+
], {
|
|
325
|
+
type: 'page',
|
|
326
|
+
contentSelector: '.page-content',
|
|
327
|
+
concurrency: 2,
|
|
328
|
+
});
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
### RSS/Atom Feeds
|
|
332
|
+
|
|
333
|
+
Ingest blog posts from RSS or Atom feeds:
|
|
334
|
+
|
|
335
|
+
```typescript
|
|
336
|
+
// Simple RSS ingestion
|
|
337
|
+
await cmsPlugin.ingestFromRSS({
|
|
338
|
+
feedUrl: 'https://myblog.com/feed/',
|
|
339
|
+
type: 'post',
|
|
340
|
+
});
|
|
341
|
+
|
|
342
|
+
// Fetch full article content (not just excerpt)
|
|
343
|
+
await cmsPlugin.ingestFromRSS({
|
|
344
|
+
feedUrl: 'https://myblog.com/feed/',
|
|
345
|
+
fetchFullContent: true, // Crawl each article page
|
|
346
|
+
contentSelector: 'article',
|
|
347
|
+
});
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
**Supported formats:**
|
|
351
|
+
- RSS 2.0
|
|
352
|
+
- RSS 1.0
|
|
353
|
+
- Atom
|
|
354
|
+
|
|
355
|
+
## URL Ingestion
|
|
356
|
+
|
|
357
|
+
Ingest from any JSON, CSV, or XML endpoint:
|
|
358
|
+
|
|
359
|
+
```typescript
|
|
360
|
+
// JSON API
|
|
361
|
+
await cmsPlugin.ingestFromUrl({
|
|
362
|
+
url: 'https://api.example.com/posts',
|
|
363
|
+
type: 'json',
|
|
364
|
+
transform: {
|
|
365
|
+
documentPath: 'data.posts', // Dot-separated path to the array of documents
|
|
366
|
+
fieldMapping: {
|
|
367
|
+
id: 'post_id',
|
|
368
|
+
content: 'body_html',
|
|
369
|
+
type: () => 'blog', // Static value
|
|
370
|
+
title: 'title',
|
|
371
|
+
author: 'author.name',
|
|
372
|
+
publishedAt: 'created_at',
|
|
373
|
+
},
|
|
374
|
+
},
|
|
375
|
+
auth: {
|
|
376
|
+
type: 'bearer',
|
|
377
|
+
token: process.env.API_TOKEN,
|
|
378
|
+
},
|
|
379
|
+
});
|
|
380
|
+
|
|
381
|
+
// CSV file
|
|
382
|
+
await cmsPlugin.ingestFromUrl({
|
|
383
|
+
url: 'https://example.com/content.csv',
|
|
384
|
+
type: 'csv',
|
|
385
|
+
transform: {
|
|
386
|
+
fieldMapping: {
|
|
387
|
+
id: 'ID',
|
|
388
|
+
content: 'Description',
|
|
389
|
+
type: 'ContentType',
|
|
390
|
+
title: 'Title',
|
|
391
|
+
},
|
|
392
|
+
},
|
|
393
|
+
});
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
## Configuration
|
|
397
|
+
|
|
398
|
+
```typescript
|
|
399
|
+
const cmsPlugin = new WebRAGPlugin({
|
|
400
|
+
// Required
|
|
401
|
+
mongoUri: process.env.MONGODB_URI!,
|
|
402
|
+
dbName: 'my_website',
|
|
403
|
+
openaiApiKey: process.env.OPENAI_API_KEY!,
|
|
404
|
+
tenantId: 'my-company',
|
|
405
|
+
|
|
406
|
+
// Collection name (default: 'web_content')
|
|
407
|
+
collection: 'website_content',
|
|
408
|
+
|
|
409
|
+
// Embedding model (default: 'text-embedding-3-small')
|
|
410
|
+
embeddingModel: 'text-embedding-3-large',
|
|
411
|
+
|
|
412
|
+
// Search settings
|
|
413
|
+
vectorIndexName: 'content_vector_index',
|
|
414
|
+
numCandidates: 100,
|
|
415
|
+
limit: 10,
|
|
416
|
+
minScore: 0.7,
|
|
417
|
+
|
|
418
|
+
// Filterable fields for MongoDB indexing
|
|
419
|
+
filterableFields: ['type', 'category', 'author', 'sector', 'tags'],
|
|
420
|
+
|
|
421
|
+
// Boost certain content types
|
|
422
|
+
typeBoosts: {
|
|
423
|
+
project: 1.2, // Projects rank higher
|
|
424
|
+
news: 0.8, // News ranks lower
|
|
425
|
+
faq: 1.5, // FAQs rank highest
|
|
426
|
+
},
|
|
427
|
+
|
|
428
|
+
// Boost recent content
|
|
429
|
+
recencyBoost: {
|
|
430
|
+
enabled: true,
|
|
431
|
+
field: 'publishedAt',
|
|
432
|
+
decayDays: 90, // Content older than 90 days gets less boost
|
|
433
|
+
maxBoost: 1.3,
|
|
434
|
+
},
|
|
435
|
+
|
|
436
|
+
// Caching
|
|
437
|
+
cache: {
|
|
438
|
+
embeddings: {
|
|
439
|
+
enabled: true,
|
|
440
|
+
ttl: 3600000, // 1 hour
|
|
441
|
+
maxSize: 1000,
|
|
442
|
+
},
|
|
443
|
+
},
|
|
444
|
+
|
|
445
|
+
// Plugin priority
|
|
446
|
+
priority: 100,
|
|
447
|
+
});
|
|
448
|
+
```
|
|
449
|
+
|
|
450
|
+
## Filtering in Queries
|
|
451
|
+
|
|
452
|
+
Filter by any metadata field during retrieval:
|
|
453
|
+
|
|
454
|
+
```typescript
|
|
455
|
+
// Get only projects
|
|
456
|
+
const response = await client.chat({
|
|
457
|
+
threadId: thread.id,
|
|
458
|
+
message: 'Show me healthcare projects in Phoenix',
|
|
459
|
+
useRAG: true,
|
|
460
|
+
ragFilters: {
|
|
461
|
+
type: 'project',
|
|
462
|
+
sector: 'Healthcare',
|
|
463
|
+
},
|
|
464
|
+
});
|
|
465
|
+
|
|
466
|
+
// Get recent blog posts
|
|
467
|
+
const response = await client.chat({
|
|
468
|
+
threadId: thread.id,
|
|
469
|
+
message: 'Latest articles about sustainability',
|
|
470
|
+
useRAG: true,
|
|
471
|
+
ragFilters: {
|
|
472
|
+
type: { $in: ['blog', 'perspective', 'news'] },
|
|
473
|
+
},
|
|
474
|
+
});
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
## Multi-Agent Setup
|
|
478
|
+
|
|
479
|
+
Share content across agents or isolate by agent:
|
|
480
|
+
|
|
481
|
+
```typescript
|
|
482
|
+
// Shared content (available to all agents)
|
|
483
|
+
await cmsPlugin.ingest(documents);
|
|
484
|
+
|
|
485
|
+
// Agent-specific content
|
|
486
|
+
await cmsPlugin.ingest(documents, { agentId: 'sales-bot' });
|
|
487
|
+
await cmsPlugin.ingest(documents, { agentId: 'support-bot' });
|
|
488
|
+
```
|
|
489
|
+
|
|
490
|
+
## MongoDB Index Setup
|
|
491
|
+
|
|
492
|
+
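Create a vector search index in MongoDB Atlas. The index `name` must match the plugin's `vectorIndexName` setting, and `numDimensions` must match your embedding model (1536 for `text-embedding-3-small`, 3072 for `text-embedding-3-large`):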
|
|
493
|
+
|
|
494
|
+
```json
|
|
495
|
+
{
|
|
496
|
+
"name": "web_vector_index",
|
|
497
|
+
"type": "vectorSearch",
|
|
498
|
+
"definition": {
|
|
499
|
+
"fields": [
|
|
500
|
+
{
|
|
501
|
+
"type": "vector",
|
|
502
|
+
"path": "embedding",
|
|
503
|
+
"numDimensions": 1536,
|
|
504
|
+
"similarity": "cosine"
|
|
505
|
+
},
|
|
506
|
+
{
|
|
507
|
+
"type": "filter",
|
|
508
|
+
"path": "tenantId"
|
|
509
|
+
},
|
|
510
|
+
{
|
|
511
|
+
"type": "filter",
|
|
512
|
+
"path": "metadata.type"
|
|
513
|
+
},
|
|
514
|
+
{
|
|
515
|
+
"type": "filter",
|
|
516
|
+
"path": "agentId"
|
|
517
|
+
}
|
|
518
|
+
]
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
```
|
|
522
|
+
|
|
523
|
+
## API Reference
|
|
524
|
+
|
|
525
|
+
### Core Methods
|
|
526
|
+
|
|
527
|
+
| Method | Description |
|
|
528
|
+
|--------|-------------|
|
|
529
|
+
| `ingest(documents, options?)` | Ingest documents into the RAG system |
|
|
530
|
+
| `update(id, document, options?)` | Update a single document |
|
|
531
|
+
| `delete(ids, options?)` | Delete document(s) by ID |
|
|
532
|
+
| `bulk(operations, options?)` | Perform bulk insert/update/delete operations |
|
|
533
|
+
| `retrieveContext(message, options?)` | Retrieve relevant content (called by SDK) |
|
|
534
|
+
|
|
535
|
+
### CMS Integrations
|
|
536
|
+
|
|
537
|
+
| Method | CMS |
|
|
538
|
+
|--------|-----|
|
|
539
|
+
| `ingestFromDrupal(config, options?)` | Drupal JSON:API |
|
|
540
|
+
| `ingestFromWordPress(config, options?)` | WordPress REST API |
|
|
541
|
+
| `ingestFromSanity(config, options?)` | Sanity.io GROQ |
|
|
542
|
+
| `ingestFromStrapi(config, options?)` | Strapi v3 & v4 |
|
|
543
|
+
|
|
544
|
+
### Web Crawling (Zero Setup)
|
|
545
|
+
|
|
546
|
+
| Method | Description |
|
|
547
|
+
|--------|-------------|
|
|
548
|
+
| `ingestFromSitemap(config, options?)` | Crawl pages from sitemap.xml |
|
|
549
|
+
| `ingestFromUrls(urls, config?, options?)` | Crawl specific URLs |
|
|
550
|
+
| `ingestFromRSS(config, options?)` | Ingest from RSS/Atom feeds |
|
|
551
|
+
|
|
552
|
+
### URL Ingestion
|
|
553
|
+
|
|
554
|
+
| Method | Description |
|
|
555
|
+
|--------|-------------|
|
|
556
|
+
| `ingestFromUrl(source, options?)` | Ingest from JSON, CSV, or XML endpoint |
|
|
557
|
+
|
|
558
|
+
### Utilities
|
|
559
|
+
|
|
560
|
+
| Method | Description |
|
|
561
|
+
|--------|-------------|
|
|
562
|
+
| `getCacheStats()` | Get embedding cache statistics |
|
|
563
|
+
| `clearCache()` | Clear the embedding cache |
|
|
564
|
+
| `disconnect()` | Close MongoDB connection |
|
|
565
|
+
| `WebRAGPlugin.sanityBlocksToText(blocks)` | Convert Portable Text to plain text |
|
|
566
|
+
| `WebRAGPlugin.parseDrupalType(type)` | Parse Drupal node type |
|
|
567
|
+
|
|
568
|
+
## License
|
|
569
|
+
|
|
570
|
+
MIT
|
|
571
|
+
|