@toolpack-sdk/knowledge 1.3.0 → 2.0.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +323 -2
- package/dist/index.cjs +56 -9
- package/dist/index.d.cts +178 -1
- package/dist/index.d.ts +178 -1
- package/dist/index.js +56 -9
- package/package.json +11 -2
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# toolpack-knowledge
|
|
2
2
|
|
|
3
|
-
RAG (Retrieval-Augmented Generation) package for Toolpack SDK.
|
|
3
|
+
RAG (Retrieval-Augmented Generation) package for Toolpack SDK with advanced features for web crawling, API indexing, streaming ingestion, and hybrid search.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
@@ -54,6 +54,38 @@ const results = await kb.query('authentication setup', {
|
|
|
54
54
|
});
|
|
55
55
|
```
|
|
56
56
|
|
|
57
|
+
### Advanced Usage
|
|
58
|
+
|
|
59
|
+
```typescript
|
|
60
|
+
import { Knowledge, WebUrlSource, ApiDataSource, PersistentKnowledgeProvider, OllamaEmbedder } from '@toolpack-sdk/knowledge';
|
|
61
|
+
|
|
62
|
+
// Web crawling + API indexing with hybrid search
|
|
63
|
+
const kb = await Knowledge.create({
|
|
64
|
+
provider: new PersistentKnowledgeProvider({ namespace: 'advanced-docs' }),
|
|
65
|
+
sources: [
|
|
66
|
+
new WebUrlSource(['https://docs.example.com'], {
|
|
67
|
+
maxDepth: 2,
|
|
68
|
+
delayMs: 1000,
|
|
69
|
+
}),
|
|
70
|
+
new ApiDataSource('https://api.example.com/docs', {
|
|
71
|
+
pagination: { param: 'page', start: 1, maxPages: 5 },
|
|
72
|
+
contentExtractor: (doc) => `${doc.title}\n\n${doc.content}`,
|
|
73
|
+
}),
|
|
74
|
+
],
|
|
75
|
+
embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }),
|
|
76
|
+
streamingBatchSize: 50, // Efficient processing of large datasets
|
|
77
|
+
description: 'Comprehensive documentation from web and API sources.',
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
// Hybrid search combining semantic and keyword matching
|
|
81
|
+
const results = await kb.query('authentication setup', {
|
|
82
|
+
searchType: 'hybrid',
|
|
83
|
+
semanticWeight: 0.6, // 60% semantic, 40% keyword
|
|
84
|
+
limit: 10,
|
|
85
|
+
threshold: 0.7,
|
|
86
|
+
});
|
|
87
|
+
```
|
|
88
|
+
|
|
57
89
|
### Agent Integration
|
|
58
90
|
|
|
59
91
|
```typescript
|
|
@@ -75,6 +107,128 @@ const toolpack = await Toolpack.init({
|
|
|
75
107
|
const response = await toolpack.chat('How do I configure authentication?');
|
|
76
108
|
```
|
|
77
109
|
|
|
110
|
+
## Advanced Features
|
|
111
|
+
|
|
112
|
+
### Web URL Sources
|
|
113
|
+
|
|
114
|
+
Crawl and index websites with automatic HTML parsing and link following.
|
|
115
|
+
|
|
116
|
+
```typescript
|
|
117
|
+
import { Knowledge, MemoryProvider, OllamaEmbedder, WebUrlSource } from '@toolpack-sdk/knowledge';
|
|
118
|
+
|
|
119
|
+
const webSource = new WebUrlSource(['https://docs.example.com'], {
|
|
120
|
+
maxDepth: 3, // Follow links up to 3 levels deep
|
|
121
|
+
delayMs: 1000, // Respectful crawling delay
|
|
122
|
+
userAgent: 'MyApp/1.0', // Custom user agent
|
|
123
|
+
maxChunkSize: 1500, // Chunk size for web content
|
|
124
|
+
timeoutMs: 30000, // Request timeout
|
|
125
|
+
sameDomainOnly: true, // Only follow links on the same domain (default: true)
|
|
126
|
+
maxPagesPerDomain: 20, // Cap pages per domain (default: 10)
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
const kb = await Knowledge.create({
|
|
130
|
+
provider: new MemoryProvider(),
|
|
131
|
+
sources: [webSource],
|
|
132
|
+
embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }),
|
|
133
|
+
description: 'Web documentation and guides.',
|
|
134
|
+
});
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
**Features:**
|
|
138
|
+
- Recursive website crawling with depth control
|
|
139
|
+
- Automatic HTML text extraction (removes scripts/styles)
|
|
140
|
+
- Link discovery and following
|
|
141
|
+
- Respectful crawling with configurable delays
|
|
142
|
+
- Metadata includes title, URL, and source type
|
|
143
|
+
|
|
144
|
+
### API Data Sources
|
|
145
|
+
|
|
146
|
+
Index data from REST APIs with pagination support.
|
|
147
|
+
|
|
148
|
+
```typescript
|
|
149
|
+
import { Knowledge, ApiDataSource, PersistentKnowledgeProvider, OpenAIEmbedder } from '@toolpack-sdk/knowledge';
|
|
150
|
+
|
|
151
|
+
const apiSource = new ApiDataSource('https://api.github.com/repos/toolpack-ai/toolpack-sdk/issues', {
|
|
152
|
+
headers: {
|
|
153
|
+
'Authorization': `Bearer ${process.env.GITHUB_TOKEN}`,
|
|
154
|
+
'Accept': 'application/vnd.github.v3+json',
|
|
155
|
+
},
|
|
156
|
+
pagination: {
|
|
157
|
+
param: 'page',
|
|
158
|
+
start: 1,
|
|
159
|
+
maxPages: 5,
|
|
160
|
+
},
|
|
161
|
+
dataPath: '', // Root-level array
|
|
162
|
+
contentExtractor: (issue: any) => `${issue.title}\n\n${issue.body}`,
|
|
163
|
+
metadataExtractor: (issue: any) => ({
|
|
164
|
+
id: issue.id,
|
|
165
|
+
state: issue.state,
|
|
166
|
+
labels: issue.labels?.map(l => l.name),
|
|
167
|
+
}),
|
|
168
|
+
});
|
|
169
|
+
|
|
170
|
+
const kb = await Knowledge.create({
|
|
171
|
+
provider: new PersistentKnowledgeProvider({ namespace: 'github-issues' }),
|
|
172
|
+
sources: [apiSource],
|
|
173
|
+
embedder: new OpenAIEmbedder({ model: 'text-embedding-3-small' }),
|
|
174
|
+
description: 'GitHub issues and discussions.',
|
|
175
|
+
});
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**Features:**
|
|
179
|
+
- REST API data ingestion (GET/POST)
|
|
180
|
+
- Automatic pagination handling
|
|
181
|
+
- Custom content and metadata extractors
|
|
182
|
+
- JSON path support for nested data
|
|
183
|
+
- Flexible data transformation
|
|
184
|
+
|
|
185
|
+
### Streaming Ingestion
|
|
186
|
+
|
|
187
|
+
Process large datasets efficiently with batch processing.
|
|
188
|
+
|
|
189
|
+
```typescript
|
|
190
|
+
const kb = await Knowledge.create({
|
|
191
|
+
provider: new PersistentKnowledgeProvider({ namespace: 'large-dataset' }),
|
|
192
|
+
sources: [new ApiDataSource('https://api.example.com/large-dataset')],
|
|
193
|
+
embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }),
|
|
194
|
+
streamingBatchSize: 50, // Process 50 chunks at a time
|
|
195
|
+
description: 'Large dataset with streaming ingestion.',
|
|
196
|
+
onEmbeddingProgress: (event) => {
|
|
197
|
+
console.log(`Processed: ${event.current}/${event.total} chunks`);
|
|
198
|
+
},
|
|
199
|
+
});
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Hybrid Search
|
|
203
|
+
|
|
204
|
+
Combine semantic and keyword search for better results.
|
|
205
|
+
|
|
206
|
+
```typescript
|
|
207
|
+
// Semantic search (default)
|
|
208
|
+
const semanticResults = await kb.query('machine learning algorithms', {
|
|
209
|
+
searchType: 'semantic',
|
|
210
|
+
limit: 5,
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
// Keyword search
|
|
214
|
+
const keywordResults = await kb.query('machine learning algorithms', {
|
|
215
|
+
searchType: 'keyword',
|
|
216
|
+
limit: 5,
|
|
217
|
+
});
|
|
218
|
+
|
|
219
|
+
// Hybrid search (recommended)
|
|
220
|
+
const hybridResults = await kb.query('machine learning algorithms', {
|
|
221
|
+
searchType: 'hybrid',
|
|
222
|
+
semanticWeight: 0.7, // 70% semantic, 30% keyword
|
|
223
|
+
limit: 5,
|
|
224
|
+
});
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
**Search Types:**
|
|
228
|
+
- `semantic` — Vector similarity search (default)
|
|
229
|
+
- `keyword` — Text matching search
|
|
230
|
+
- `hybrid` — Combined semantic + keyword search
|
|
231
|
+
|
|
78
232
|
## Providers
|
|
79
233
|
|
|
80
234
|
### MemoryProvider
|
|
@@ -121,6 +275,133 @@ new MarkdownSource('./docs/**/*.md', {
|
|
|
121
275
|
- Code block detection (`hasCode` metadata)
|
|
122
276
|
- Deterministic chunk IDs
|
|
123
277
|
|
|
278
|
+
### WebUrlSource
|
|
279
|
+
|
|
280
|
+
Crawl and index web pages with HTML parsing.
|
|
281
|
+
|
|
282
|
+
```typescript
|
|
283
|
+
new WebUrlSource(['https://example.com', 'https://docs.example.com'], {
|
|
284
|
+
maxDepth: 2, // Crawl depth (default: 1)
|
|
285
|
+
delayMs: 1000, // Delay between requests (default: 1000ms)
|
|
286
|
+
userAgent: 'MyApp/1.0', // Custom user agent
|
|
287
|
+
maxChunkSize: 2000, // Max tokens per chunk
|
|
288
|
+
chunkOverlap: 200, // Overlap between chunks
|
|
289
|
+
timeoutMs: 30000, // Request timeout (default: 30000ms)
|
|
290
|
+
sameDomainOnly: true, // Only follow links on the same domain (default: true)
|
|
291
|
+
maxPagesPerDomain: 10, // Max pages crawled per domain (default: 10)
|
|
292
|
+
namespace: 'web', // Chunk ID prefix
|
|
293
|
+
metadata: { source: 'web' }, // Added to all chunks
|
|
294
|
+
})
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
**Features:**
|
|
298
|
+
- Recursive website crawling
|
|
299
|
+
- Automatic HTML text extraction
|
|
300
|
+
- Link discovery and following
|
|
301
|
+
- Respectful crawling with delays
|
|
302
|
+
- Error handling for failed requests
|
|
303
|
+
|
|
304
|
+
### ApiDataSource
|
|
305
|
+
|
|
306
|
+
Index data from REST APIs with pagination.
|
|
307
|
+
|
|
308
|
+
```typescript
|
|
309
|
+
new ApiDataSource('https://api.example.com/data', {
|
|
310
|
+
method: 'GET', // HTTP method (default: 'GET')
|
|
311
|
+
headers: { // Request headers
|
|
312
|
+
'Authorization': 'Bearer token',
|
|
313
|
+
'Content-Type': 'application/json',
|
|
314
|
+
},
|
|
315
|
+
body: JSON.stringify({}), // Request body for POST
|
|
316
|
+
pagination: { // Pagination config
|
|
317
|
+
param: 'page', // Query param name
|
|
318
|
+
start: 1, // Starting page number
|
|
319
|
+
step: 1, // Page increment
|
|
320
|
+
maxPages: 10, // Max pages to fetch
|
|
321
|
+
},
|
|
322
|
+
dataPath: 'data.items', // JSON path to data array
|
|
323
|
+
contentExtractor: (item) => // Custom content extraction
|
|
324
|
+
`${item.title}\n\n${item.description}`,
|
|
325
|
+
metadataExtractor: (item) => ({ // Custom metadata extraction
|
|
326
|
+
id: item.id,
|
|
327
|
+
category: item.category,
|
|
328
|
+
}),
|
|
329
|
+
maxChunkSize: 2000, // Max tokens per chunk
|
|
330
|
+
chunkOverlap: 200, // Overlap between chunks
|
|
331
|
+
timeoutMs: 30000, // Request timeout
|
|
332
|
+
namespace: 'api', // Chunk ID prefix
|
|
333
|
+
metadata: { source: 'api' }, // Added to all chunks
|
|
334
|
+
})
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
**Features:**
|
|
338
|
+
- REST API data ingestion
|
|
339
|
+
- Automatic pagination handling
|
|
340
|
+
- Custom data extractors
|
|
341
|
+
- JSON path support
|
|
342
|
+
- Flexible content transformation
|
|
343
|
+
|
|
344
|
+
### JSONSource
|
|
345
|
+
|
|
346
|
+
Index data from local JSON files.
|
|
347
|
+
|
|
348
|
+
```typescript
|
|
349
|
+
import { JSONSource } from '@toolpack-sdk/knowledge';
|
|
350
|
+
|
|
351
|
+
new JSONSource('./data/products.json', {
|
|
352
|
+
toContent: (item: any) => `${item.name}\n\n${item.description}`, // Required
|
|
353
|
+
filter: (item: any) => item.active === true, // Optional: filter items
|
|
354
|
+
chunkSize: 100, // Items per chunk (default: 100)
|
|
355
|
+
namespace: 'products',
|
|
356
|
+
metadata: { source: 'products-db' },
|
|
357
|
+
})
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
**Features:**
|
|
361
|
+
- Parses JSON arrays (or single objects)
|
|
362
|
+
- Optional item-level filtering
|
|
363
|
+
- Required `toContent` callback to control what gets embedded
|
|
364
|
+
|
|
365
|
+
### SQLiteSource
|
|
366
|
+
|
|
367
|
+
Index rows from a SQLite database. Requires `better-sqlite3`.
|
|
368
|
+
|
|
369
|
+
```typescript
|
|
370
|
+
import { SQLiteSource } from '@toolpack-sdk/knowledge';
|
|
371
|
+
|
|
372
|
+
new SQLiteSource('./data/app.db', {
|
|
373
|
+
query: 'SELECT id, title, body FROM articles WHERE published = 1', // Optional: defaults to all rows
|
|
374
|
+
toContent: (row) => `${row.title}\n\n${row.body}`, // Required
|
|
375
|
+
chunkSize: 50, // Rows per chunk (default: 100)
|
|
376
|
+
namespace: 'articles',
|
|
377
|
+
metadata: { source: 'sqlite' },
|
|
378
|
+
preLoadCSV: { // Optional: load a CSV into the DB before querying
|
|
379
|
+
tableName: 'articles',
|
|
380
|
+
csvPath: './data/articles.csv',
|
|
381
|
+
delimiter: ',',
|
|
382
|
+
headers: true,
|
|
383
|
+
},
|
|
384
|
+
})
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
### PostgresSource
|
|
388
|
+
|
|
389
|
+
Index rows from a PostgreSQL database. Requires `pg`.
|
|
390
|
+
|
|
391
|
+
```typescript
|
|
392
|
+
import { PostgresSource } from '@toolpack-sdk/knowledge';
|
|
393
|
+
|
|
394
|
+
new PostgresSource({
|
|
395
|
+
connectionString: process.env.DATABASE_URL, // or use host/port/database/user/password
|
|
396
|
+
query: 'SELECT id, title, content FROM docs WHERE status = $1',
|
|
397
|
+
toContent: (row) => `${row.title}\n\n${row.content}`, // Required
|
|
398
|
+
chunkSize: 50,
|
|
399
|
+
namespace: 'docs',
|
|
400
|
+
metadata: { source: 'postgres' },
|
|
401
|
+
ssl: true,
|
|
402
|
+
})
|
|
403
|
+
```
|
|
404
|
+
|
|
124
405
|
## Embedders
|
|
125
406
|
|
|
126
407
|
### OllamaEmbedder
|
|
@@ -129,11 +410,34 @@ Local embeddings via Ollama. Zero API cost.
|
|
|
129
410
|
|
|
130
411
|
```typescript
|
|
131
412
|
new OllamaEmbedder({
|
|
132
|
-
model: 'nomic-embed-text', // or 'mxbai-embed-large'
|
|
413
|
+
model: 'nomic-embed-text', // or 'mxbai-embed-large', 'all-minilm', 'bge-m3', etc.
|
|
133
414
|
baseUrl: 'http://localhost:11434', // default
|
|
415
|
+
dimensions: 768, // optional: override auto-detected dimensions
|
|
416
|
+
retries: 3, // default
|
|
417
|
+
retryDelay: 1000, // ms, default
|
|
134
418
|
})
|
|
135
419
|
```
|
|
136
420
|
|
|
421
|
+
Known models: `nomic-embed-text` (768), `mxbai-embed-large` (1024), `all-minilm` (384), `snowflake-arctic-embed` (1024), `bge-m3` (1024), `bge-large` (1024). Pass `dimensions` for any other model.
|
|
422
|
+
|
|
423
|
+
### OpenRouterEmbedder
|
|
424
|
+
|
|
425
|
+
Embeddings via OpenRouter, giving access to OpenAI embedding models through a single API key.
|
|
426
|
+
|
|
427
|
+
```typescript
|
|
428
|
+
import { OpenRouterEmbedder } from '@toolpack-sdk/knowledge';
|
|
429
|
+
|
|
430
|
+
new OpenRouterEmbedder({
|
|
431
|
+
model: 'openai/text-embedding-3-small', // or 'openai/text-embedding-3-large', 'openai/text-embedding-ada-002'
|
|
432
|
+
apiKey: process.env.OPENROUTER_API_KEY!,
|
|
433
|
+
dimensions: 1536, // optional: override auto-detected dimensions
|
|
434
|
+
retries: 3, // default
|
|
435
|
+
retryDelay: 1000, // ms, default
|
|
436
|
+
})
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
Known models: `openai/text-embedding-3-small` (1536), `openai/text-embedding-3-large` (3072), `openai/text-embedding-ada-002` (1536). Pass `dimensions` for any other model.
|
|
440
|
+
|
|
137
441
|
### OpenAIEmbedder
|
|
138
442
|
|
|
139
443
|
OpenAI text-embedding models with retry logic.
|
|
@@ -159,6 +463,7 @@ interface KnowledgeOptions {
|
|
|
159
463
|
embedder: Embedder;
|
|
160
464
|
description: string; // Required: used as tool description
|
|
161
465
|
reSync?: boolean; // default: true
|
|
466
|
+
streamingBatchSize?: number; // Process chunks in batches (default: 100)
|
|
162
467
|
onError?: (error, context) => 'skip' | 'abort';
|
|
163
468
|
onSync?: (event: SyncEvent) => void;
|
|
164
469
|
onEmbeddingProgress?: (event: EmbeddingProgressEvent) => void;
|
|
@@ -171,6 +476,8 @@ interface KnowledgeOptions {
|
|
|
171
476
|
await kb.query('search query', {
|
|
172
477
|
limit: 10, // Max results
|
|
173
478
|
threshold: 0.7, // Similarity threshold (0-1)
|
|
479
|
+
searchType: 'hybrid', // 'semantic' | 'keyword' | 'hybrid' (default: 'semantic')
|
|
480
|
+
semanticWeight: 0.7, // Share of the hybrid score given to semantic similarity (0-1); the remainder goes to keyword matching
|
|
174
481
|
filter: { // Metadata filters
|
|
175
482
|
hasCode: true,
|
|
176
483
|
category: { $in: ['api', 'guide'] },
|
|
@@ -180,6 +487,20 @@ await kb.query('search query', {
|
|
|
180
487
|
});
|
|
181
488
|
```
|
|
182
489
|
|
|
490
|
+
### Utility Functions
|
|
491
|
+
|
|
492
|
+
```typescript
|
|
493
|
+
import { keywordSearch, combineScores } from '@toolpack-sdk/knowledge';
|
|
494
|
+
|
|
495
|
+
// Manual keyword search
|
|
496
|
+
const keywordScore = keywordSearch('document content', 'search query');
|
|
497
|
+
// Returns: a number between 0 and 1
|
|
498
|
+
|
|
499
|
+
// Combine semantic and keyword scores
|
|
500
|
+
const combinedScore = combineScores(semanticScore, keywordScore, 0.7);
|
|
501
|
+
// Returns: weighted combination
|
|
502
|
+
```
|
|
503
|
+
|
|
183
504
|
### Metadata Filters
|
|
184
505
|
|
|
185
506
|
```typescript
|