@toolpack-sdk/knowledge 1.3.0 → 2.0.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # toolpack-knowledge
2
2
 
3
- RAG (Retrieval-Augmented Generation) package for Toolpack SDK.
3
+ RAG (Retrieval-Augmented Generation) package for Toolpack SDK with advanced features for web crawling, API indexing, streaming ingestion, and hybrid search.
4
4
 
5
5
  ## Installation
6
6
 
@@ -54,6 +54,38 @@ const results = await kb.query('authentication setup', {
54
54
  });
55
55
  ```
56
56
 
57
+ ### Advanced Usage
58
+
59
+ ```typescript
60
+ import { Knowledge, WebUrlSource, ApiDataSource, PersistentKnowledgeProvider, OllamaEmbedder } from '@toolpack-sdk/knowledge';
61
+
62
+ // Web crawling + API indexing with hybrid search
63
+ const kb = await Knowledge.create({
64
+ provider: new PersistentKnowledgeProvider({ namespace: 'advanced-docs' }),
65
+ sources: [
66
+ new WebUrlSource(['https://docs.example.com'], {
67
+ maxDepth: 2,
68
+ delayMs: 1000,
69
+ }),
70
+ new ApiDataSource('https://api.example.com/docs', {
71
+ pagination: { param: 'page', start: 1, maxPages: 5 },
72
+ contentExtractor: (doc) => `${doc.title}\n\n${doc.content}`,
73
+ }),
74
+ ],
75
+ embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }),
76
+ streamingBatchSize: 50, // Efficient processing of large datasets
77
+ description: 'Comprehensive documentation from web and API sources.',
78
+ });
79
+
80
+ // Hybrid search combining semantic and keyword matching
81
+ const results = await kb.query('authentication setup', {
82
+ searchType: 'hybrid',
83
+ semanticWeight: 0.6, // 60% semantic, 40% keyword
84
+ limit: 10,
85
+ threshold: 0.7,
86
+ });
87
+ ```
88
+
57
89
  ### Agent Integration
58
90
 
59
91
  ```typescript
@@ -75,6 +107,128 @@ const toolpack = await Toolpack.init({
75
107
  const response = await toolpack.chat('How do I configure authentication?');
76
108
  ```
77
109
 
110
+ ## Advanced Features
111
+
112
+ ### Web URL Sources
113
+
114
+ Crawl and index websites with automatic HTML parsing and link following.
115
+
116
+ ```typescript
117
+ import { Knowledge, MemoryProvider, OllamaEmbedder, WebUrlSource } from '@toolpack-sdk/knowledge';
118
+
119
+ const webSource = new WebUrlSource(['https://docs.example.com'], {
120
+ maxDepth: 3, // Follow links up to 3 levels deep
121
+ delayMs: 1000, // Respectful crawling delay
122
+ userAgent: 'MyApp/1.0', // Custom user agent
123
+ maxChunkSize: 1500, // Chunk size for web content
124
+ timeoutMs: 30000, // Request timeout
125
+ sameDomainOnly: true, // Only follow links on the same domain (default: true)
126
+ maxPagesPerDomain: 20, // Cap pages per domain (default: 10)
127
+ });
128
+
129
+ const kb = await Knowledge.create({
130
+ provider: new MemoryProvider(),
131
+ sources: [webSource],
132
+ embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }),
133
+ description: 'Web documentation and guides.',
134
+ });
135
+ ```
136
+
137
+ **Features:**
138
+ - Recursive website crawling with depth control
139
+ - Automatic HTML text extraction (removes scripts/styles)
140
+ - Link discovery and following
141
+ - Respectful crawling with configurable delays
142
+ - Metadata includes title, URL, and source type
143
+
144
+ ### API Data Sources
145
+
146
+ Index data from REST APIs with pagination support.
147
+
148
+ ```typescript
149
+ import { ApiDataSource, Knowledge, OpenAIEmbedder, PersistentKnowledgeProvider } from '@toolpack-sdk/knowledge';
150
+
151
+ const apiSource = new ApiDataSource('https://api.github.com/repos/toolpack-ai/toolpack-sdk/issues', {
152
+ headers: {
153
+ 'Authorization': `Bearer ${process.env.GITHUB_TOKEN}`,
154
+ 'Accept': 'application/vnd.github.v3+json',
155
+ },
156
+ pagination: {
157
+ param: 'page',
158
+ start: 1,
159
+ maxPages: 5,
160
+ },
161
+ dataPath: '', // Root level array
162
+ contentExtractor: (issue: any) => `${issue.title}\n\n${issue.body}`,
163
+ metadataExtractor: (issue: any) => ({
164
+ id: issue.id,
165
+ state: issue.state,
166
+ labels: issue.labels?.map(l => l.name),
167
+ }),
168
+ });
169
+
170
+ const kb = await Knowledge.create({
171
+ provider: new PersistentKnowledgeProvider({ namespace: 'github-issues' }),
172
+ sources: [apiSource],
173
+ embedder: new OpenAIEmbedder({ model: 'text-embedding-3-small' }),
174
+ description: 'GitHub issues and discussions.',
175
+ });
176
+ ```
177
+
178
+ **Features:**
179
+ - REST API data ingestion (GET/POST)
180
+ - Automatic pagination handling
181
+ - Custom content and metadata extractors
182
+ - JSON path support for nested data
183
+ - Flexible data transformation
184
+
185
+ ### Streaming Ingestion
186
+
187
+ Process large datasets efficiently with batch processing.
188
+
189
+ ```typescript
190
+ const kb = await Knowledge.create({
191
+ provider: new PersistentKnowledgeProvider({ namespace: 'large-dataset' }),
192
+ sources: [new ApiDataSource('https://api.example.com/large-dataset')],
193
+ embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }),
194
+ streamingBatchSize: 50, // Process 50 chunks at a time
195
+ description: 'Large dataset with streaming ingestion.',
196
+ onEmbeddingProgress: (event) => {
197
+ console.log(`Processed: ${event.current}/${event.total} chunks`);
198
+ },
199
+ });
200
+ ```
201
+
202
+ ### Hybrid Search
203
+
204
+ Combine semantic and keyword search for better results.
205
+
206
+ ```typescript
207
+ // Semantic search (default)
208
+ const semanticResults = await kb.query('machine learning algorithms', {
209
+ searchType: 'semantic',
210
+ limit: 5,
211
+ });
212
+
213
+ // Keyword search
214
+ const keywordResults = await kb.query('machine learning algorithms', {
215
+ searchType: 'keyword',
216
+ limit: 5,
217
+ });
218
+
219
+ // Hybrid search (recommended)
220
+ const hybridResults = await kb.query('machine learning algorithms', {
221
+ searchType: 'hybrid',
222
+ semanticWeight: 0.7, // 70% semantic, 30% keyword
223
+ limit: 5,
224
+ });
225
+ ```
226
+
227
+ **Search Types:**
228
+ - `semantic` — Vector similarity search (default)
229
+ - `keyword` — Text matching search
230
+ - `hybrid` — Combined semantic + keyword search
231
+
78
232
  ## Providers
79
233
 
80
234
  ### MemoryProvider
@@ -121,6 +275,133 @@ new MarkdownSource('./docs/**/*.md', {
121
275
  - Code block detection (`hasCode` metadata)
122
276
  - Deterministic chunk IDs
123
277
 
278
+ ### WebUrlSource
279
+
280
+ Crawl and index web pages with HTML parsing.
281
+
282
+ ```typescript
283
+ new WebUrlSource(['https://example.com', 'https://docs.example.com'], {
284
+ maxDepth: 2, // Crawl depth (default: 1)
285
+ delayMs: 1000, // Delay between requests (default: 1000ms)
286
+ userAgent: 'MyApp/1.0', // Custom user agent
287
+ maxChunkSize: 2000, // Max tokens per chunk
288
+ chunkOverlap: 200, // Overlap between chunks
289
+ timeoutMs: 30000, // Request timeout (default: 30000ms)
290
+ sameDomainOnly: true, // Only follow links on the same domain (default: true)
291
+ maxPagesPerDomain: 10, // Max pages crawled per domain (default: 10)
292
+ namespace: 'web', // Chunk ID prefix
293
+ metadata: { source: 'web' }, // Added to all chunks
294
+ })
295
+ ```
296
+
297
+ **Features:**
298
+ - Recursive website crawling
299
+ - Automatic HTML text extraction
300
+ - Link discovery and following
301
+ - Respectful crawling with delays
302
+ - Error handling for failed requests
303
+
304
+ ### ApiDataSource
305
+
306
+ Index data from REST APIs with pagination.
307
+
308
+ ```typescript
309
+ new ApiDataSource('https://api.example.com/data', {
310
+ method: 'GET', // HTTP method (default: 'GET')
311
+ headers: { // Request headers
312
+ 'Authorization': 'Bearer token',
313
+ 'Content-Type': 'application/json',
314
+ },
315
+ body: JSON.stringify({}), // Request body for POST
316
+ pagination: { // Pagination config
317
+ param: 'page', // Query param name
318
+ start: 1, // Starting page number
319
+ step: 1, // Page increment
320
+ maxPages: 10, // Max pages to fetch
321
+ },
322
+ dataPath: 'data.items', // JSON path to data array
323
+ contentExtractor: (item) => // Custom content extraction
324
+ `${item.title}\n\n${item.description}`,
325
+ metadataExtractor: (item) => ({ // Custom metadata extraction
326
+ id: item.id,
327
+ category: item.category,
328
+ }),
329
+ maxChunkSize: 2000, // Max tokens per chunk
330
+ chunkOverlap: 200, // Overlap between chunks
331
+ timeoutMs: 30000, // Request timeout
332
+ namespace: 'api', // Chunk ID prefix
333
+ metadata: { source: 'api' }, // Added to all chunks
334
+ })
335
+ ```
336
+
337
+ **Features:**
338
+ - REST API data ingestion
339
+ - Automatic pagination handling
340
+ - Custom data extractors
341
+ - JSON path support
342
+ - Flexible content transformation
343
+
344
+ ### JSONSource
345
+
346
+ Index data from local JSON files.
347
+
348
+ ```typescript
349
+ import { JSONSource } from '@toolpack-sdk/knowledge';
350
+
351
+ new JSONSource('./data/products.json', {
352
+ toContent: (item: any) => `${item.name}\n\n${item.description}`, // Required
353
+ filter: (item: any) => item.active === true, // Optional: filter items
354
+ chunkSize: 100, // Items per chunk (default: 100)
355
+ namespace: 'products',
356
+ metadata: { source: 'products-db' },
357
+ })
358
+ ```
359
+
360
+ **Features:**
361
+ - Parses JSON arrays (or single objects)
362
+ - Optional item-level filtering
363
+ - Required `toContent` callback to control what gets embedded
364
+
365
+ ### SQLiteSource
366
+
367
+ Index rows from a SQLite database. Requires `better-sqlite3`.
368
+
369
+ ```typescript
370
+ import { SQLiteSource } from '@toolpack-sdk/knowledge';
371
+
372
+ new SQLiteSource('./data/app.db', {
373
+ query: 'SELECT id, title, body FROM articles WHERE published = 1', // Optional: defaults to all rows
374
+ toContent: (row) => `${row.title}\n\n${row.body}`, // Required
375
+ chunkSize: 50, // Rows per chunk (default: 100)
376
+ namespace: 'articles',
377
+ metadata: { source: 'sqlite' },
378
+ preLoadCSV: { // Optional: load a CSV into the DB before querying
379
+ tableName: 'articles',
380
+ csvPath: './data/articles.csv',
381
+ delimiter: ',',
382
+ headers: true,
383
+ },
384
+ })
385
+ ```
386
+
387
+ ### PostgresSource
388
+
389
+ Index rows from a PostgreSQL database. Requires `pg`.
390
+
391
+ ```typescript
392
+ import { PostgresSource } from '@toolpack-sdk/knowledge';
393
+
394
+ new PostgresSource({
395
+ connectionString: process.env.DATABASE_URL, // or use host/port/database/user/password
396
+ query: "SELECT id, title, content FROM docs WHERE status = 'published'",
397
+ toContent: (row) => `${row.title}\n\n${row.content}`, // Required
398
+ chunkSize: 50,
399
+ namespace: 'docs',
400
+ metadata: { source: 'postgres' },
401
+ ssl: true,
402
+ })
403
+ ```
404
+
124
405
  ## Embedders
125
406
 
126
407
  ### OllamaEmbedder
@@ -129,11 +410,34 @@ Local embeddings via Ollama. Zero API cost.
129
410
 
130
411
  ```typescript
131
412
  new OllamaEmbedder({
132
- model: 'nomic-embed-text', // or 'mxbai-embed-large'
413
+ model: 'nomic-embed-text', // or 'mxbai-embed-large', 'all-minilm', 'bge-m3', etc.
133
414
  baseUrl: 'http://localhost:11434', // default
415
+ dimensions: 768, // optional: override auto-detected dimensions
416
+ retries: 3, // default
417
+ retryDelay: 1000, // ms, default
134
418
  })
135
419
  ```
136
420
 
421
+ Known models: `nomic-embed-text` (768), `mxbai-embed-large` (1024), `all-minilm` (384), `snowflake-arctic-embed` (1024), `bge-m3` (1024), `bge-large` (1024). Pass `dimensions` for any other model.
422
+
423
+ ### OpenRouterEmbedder
424
+
425
+ Embeddings via OpenRouter, giving access to OpenAI embedding models through a single API key.
426
+
427
+ ```typescript
428
+ import { OpenRouterEmbedder } from '@toolpack-sdk/knowledge';
429
+
430
+ new OpenRouterEmbedder({
431
+ model: 'openai/text-embedding-3-small', // or 'openai/text-embedding-3-large', 'openai/text-embedding-ada-002'
432
+ apiKey: process.env.OPENROUTER_API_KEY!,
433
+ dimensions: 1536, // optional: override auto-detected dimensions
434
+ retries: 3, // default
435
+ retryDelay: 1000, // ms, default
436
+ })
437
+ ```
438
+
439
+ Known models: `openai/text-embedding-3-small` (1536), `openai/text-embedding-3-large` (3072), `openai/text-embedding-ada-002` (1536). Pass `dimensions` for any other model.
440
+
137
441
  ### OpenAIEmbedder
138
442
 
139
443
  OpenAI text-embedding models with retry logic.
@@ -159,6 +463,7 @@ interface KnowledgeOptions {
159
463
  embedder: Embedder;
160
464
  description: string; // Required: used as tool description
161
465
  reSync?: boolean; // default: true
466
+ streamingBatchSize?: number; // Process chunks in batches (default: 100)
162
467
  onError?: (error, context) => 'skip' | 'abort';
163
468
  onSync?: (event: SyncEvent) => void;
164
469
  onEmbeddingProgress?: (event: EmbeddingProgressEvent) => void;
@@ -171,6 +476,8 @@ interface KnowledgeOptions {
171
476
  await kb.query('search query', {
172
477
  limit: 10, // Max results
173
478
  threshold: 0.7, // Similarity threshold (0-1)
479
+ searchType: 'hybrid', // 'semantic' | 'keyword' | 'hybrid' (default: 'semantic')
480
+ semanticWeight: 0.7, // Share of the hybrid score given to semantic similarity (0-1); the remainder goes to keyword matching
174
481
  filter: { // Metadata filters
175
482
  hasCode: true,
176
483
  category: { $in: ['api', 'guide'] },
@@ -180,6 +487,20 @@ await kb.query('search query', {
180
487
  });
181
488
  ```
182
489
 
490
+ ### Utility Functions
491
+
492
+ ```typescript
493
+ import { keywordSearch, combineScores } from '@toolpack-sdk/knowledge';
494
+
495
+ // Manual keyword search
496
+ const score = keywordSearch('document content', 'search query');
497
+ // Returns: a number between 0 and 1
498
+
499
+ // Combine semantic and keyword scores
500
+ const combinedScore = combineScores(semanticScore, keywordScore, 0.7);
501
+ // Returns: weighted combination
502
+ ```
503
+
183
504
  ### Metadata Filters
184
505
 
185
506
  ```typescript