@lov3kaizen/agentsea-embeddings 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 lovekaizen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,475 @@
1
+ # @lov3kaizen/agentsea-embeddings
2
+
3
+ Vector embedding lifecycle management toolkit for Node.js. Handles versioning, caching, chunking, drift detection, and migration across embedding models.
4
+
5
+ ## Features
6
+
7
+ - **Multiple Providers**: OpenAI, Cohere, Voyage AI, HuggingFace, and local models
8
+ - **Smart Chunking**: Fixed, recursive, semantic, markdown-aware, and code-aware strategies
9
+ - **Multi-tier Caching**: Memory, Redis, SQLite, and tiered caching
10
+ - **Version Management**: Track embedding versions and plan migrations
11
+ - **Drift Detection**: Monitor embedding quality and detect distribution drift
12
+ - **Vector Stores**: Pinecone, Chroma, Qdrant, and in-memory adapters
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ npm install @lov3kaizen/agentsea-embeddings
18
+ # or
19
+ pnpm add @lov3kaizen/agentsea-embeddings
20
+ ```
21
+
22
+ ## Quick Start
23
+
24
+ ### Basic Embedding
25
+
26
+ ```typescript
27
+ import {
28
+ createEmbeddingManager,
29
+ createOpenAIProvider,
30
+ createMemoryCache,
31
+ createMemoryStore,
32
+ } from '@lov3kaizen/agentsea-embeddings';
33
+
34
+ // Create provider
35
+ const provider = createOpenAIProvider({
36
+ apiKey: process.env.OPENAI_API_KEY!,
37
+ model: 'text-embedding-3-small',
38
+ });
39
+
40
+ // Create manager
41
+ const manager = createEmbeddingManager({
42
+ defaultModel: 'text-embedding-3-small',
43
+ defaultProvider: 'openai',
44
+ });
45
+
46
+ // Register provider and configure
47
+ manager.registerModel(provider, true);
48
+ manager.setCache(createMemoryCache());
49
+ manager.setStore(createMemoryStore({ type: 'memory', dimensions: 1536 }));
50
+
51
+ // Generate embedding
52
+ const result = await manager.embed('Hello, world!');
53
+ console.log('Dimensions:', result.dimensions);
54
+ console.log('Tokens:', result.tokenCount);
55
+
56
+ // Batch embedding
57
+ const batchResult = await manager.embedBatch([
58
+ 'First document',
59
+ 'Second document',
60
+ 'Third document',
61
+ ]);
62
+ console.log('Embedded:', batchResult.results.length, 'documents');
63
+ ```
64
+
65
+ ### Document Chunking & Embedding
66
+
67
+ ```typescript
68
+ import {
69
+ createEmbeddingManager,
70
+ createOpenAIProvider,
71
+ createRecursiveChunker,
72
+ } from '@lov3kaizen/agentsea-embeddings';
73
+
74
+ const manager = createEmbeddingManager();
75
+ const provider = createOpenAIProvider({
76
+ apiKey: process.env.OPENAI_API_KEY!,
77
+ });
78
+
79
+ manager.registerModel(provider, true);
80
+ manager.setChunker(createRecursiveChunker());
81
+
82
+ // Embed a long document
83
+ const document = `
84
+ # Introduction
85
+
86
+ This is a long document that needs to be chunked...
87
+
88
+ ## Section 1
89
+
90
+ Content for section 1...
91
+
92
+ ## Section 2
93
+
94
+ Content for section 2...
95
+ `;
96
+
97
+ const chunks = await manager.embedDocument(document, {
98
+ documentId: 'doc-1',
99
+ source: 'example.md',
100
+ type: 'markdown',
101
+ });
102
+
103
+ console.log('Created', chunks.length, 'chunks');
104
+ ```
105
+
106
+ ### Semantic Search
107
+
108
+ ```typescript
109
+ const results = await manager.search('What is the main topic?', {
110
+ topK: 5,
111
+ minScore: 0.7,
112
+ });
113
+
114
+ for (const result of results) {
115
+ console.log(`[${result.score.toFixed(3)}] ${result.text.slice(0, 100)}...`);
116
+ }
117
+ ```
118
+
119
+ ## Providers
120
+
121
+ ### OpenAI
122
+
123
+ ```typescript
124
+ import { createOpenAIProvider } from '@lov3kaizen/agentsea-embeddings';
125
+
126
+ const provider = createOpenAIProvider({
127
+ apiKey: process.env.OPENAI_API_KEY!,
128
+ model: 'text-embedding-3-small', // or 'text-embedding-3-large', 'text-embedding-ada-002'
129
+ dimensions: 1536, // optional; v3 models can return reduced dimensions (1536 is the text-embedding-3-small default)
130
+ });
131
+ ```
132
+
133
+ ### Cohere
134
+
135
+ ```typescript
136
+ import { createCohereProvider } from '@lov3kaizen/agentsea-embeddings';
137
+
138
+ const provider = createCohereProvider({
139
+ apiKey: process.env.COHERE_API_KEY!,
140
+ model: 'embed-english-v3.0',
141
+ inputType: 'search_document', // or 'search_query', 'classification', 'clustering'
142
+ });
143
+ ```
144
+
145
+ ### Voyage AI
146
+
147
+ ```typescript
148
+ import { createVoyageProvider } from '@lov3kaizen/agentsea-embeddings';
149
+
150
+ const provider = createVoyageProvider({
151
+ apiKey: process.env.VOYAGE_API_KEY!,
152
+ model: 'voyage-3', // or 'voyage-code-3', 'voyage-finance-2', etc.
153
+ });
154
+ ```
155
+
156
+ ### HuggingFace
157
+
158
+ ```typescript
159
+ import { createHuggingFaceProvider } from '@lov3kaizen/agentsea-embeddings';
160
+
161
+ const provider = createHuggingFaceProvider({
162
+ apiKey: process.env.HF_API_KEY!,
163
+ model: 'sentence-transformers/all-MiniLM-L6-v2',
164
+ });
165
+ ```
166
+
167
+ ### Local/Custom
168
+
169
+ ```typescript
170
+ import { createLocalProvider } from '@lov3kaizen/agentsea-embeddings';
171
+
172
+ const provider = createLocalProvider({
173
+ dimensions: 384,
174
+ name: 'custom-model',
175
+ embedFn: async (texts) => {
176
+ // Your custom embedding logic
177
+ return texts.map((text) => new Array(384).fill(0).map(() => Math.random()));
178
+ },
179
+ });
180
+ ```
181
+
182
+ ## Chunking Strategies
183
+
184
+ ### Fixed Size
185
+
186
+ ```typescript
187
+ import { createFixedChunker } from '@lov3kaizen/agentsea-embeddings';
188
+
189
+ const chunker = createFixedChunker();
190
+ const chunks = await chunker.chunk(text, {
191
+ chunkSize: 512,
192
+ chunkOverlap: 50,
193
+ });
194
+ ```
195
+
196
+ ### Recursive
197
+
198
+ ```typescript
199
+ import { createRecursiveChunker } from '@lov3kaizen/agentsea-embeddings';
200
+
201
+ const chunker = createRecursiveChunker();
202
+ const chunks = await chunker.chunk(text, {
203
+ chunkSize: 512,
204
+ separators: ['\n\n', '\n', '. ', ' '],
205
+ });
206
+ ```
207
+
208
+ ### Markdown-Aware
209
+
210
+ ```typescript
211
+ import { createMarkdownChunker } from '@lov3kaizen/agentsea-embeddings';
212
+
213
+ const chunker = createMarkdownChunker();
214
+ const chunks = await chunker.chunk(markdownText, {
215
+ preserveHeaders: true,
216
+ includeHeaderHierarchy: true,
217
+ });
218
+ ```
219
+
220
+ ### Code-Aware
221
+
222
+ ```typescript
223
+ import { createCodeChunker } from '@lov3kaizen/agentsea-embeddings';
224
+
225
+ const chunker = createCodeChunker();
226
+ const chunks = await chunker.chunk(sourceCode, {
227
+ language: 'typescript',
228
+ splitBy: 'function',
229
+ includeImports: true,
230
+ });
231
+ ```
232
+
233
+ ### Semantic
234
+
235
+ ```typescript
236
+ import { createSemanticChunker } from '@lov3kaizen/agentsea-embeddings';
237
+
238
+ const chunker = createSemanticChunker();
239
+ const chunks = await chunker.chunk(text, {
240
+ similarityThreshold: 0.5,
241
+ embeddingFn: async (texts) =>
242
+ provider.embedBatch(texts).then((r) => r.results.map((e) => e.vector)),
243
+ });
244
+ ```
245
+
246
+ ## Caching
247
+
248
+ ### Memory Cache
249
+
250
+ ```typescript
251
+ import { createMemoryCache } from '@lov3kaizen/agentsea-embeddings';
252
+
253
+ const cache = createMemoryCache({
254
+ maxEntries: 10000,
255
+ maxAge: 3600000, // 1 hour (in milliseconds)
256
+ });
257
+ ```
258
+
259
+ ### Redis Cache
260
+
261
+ ```typescript
262
+ import { createRedisCache } from '@lov3kaizen/agentsea-embeddings';
263
+
264
+ const cache = createRedisCache({
265
+ url: 'redis://localhost:6379',
266
+ keyPrefix: 'emb',
267
+ defaultTTL: 86400, // 24 hours (in seconds)
268
+ });
269
+
270
+ await cache.connect();
271
+ ```
272
+
273
+ ### SQLite Cache
274
+
275
+ ```typescript
276
+ import { createSQLiteCache } from '@lov3kaizen/agentsea-embeddings';
277
+
278
+ const cache = createSQLiteCache({
279
+ dbPath: './embeddings.db',
280
+ walMode: true,
281
+ });
282
+
283
+ await cache.init();
284
+ ```
285
+
286
+ ### Tiered Cache
287
+
288
+ ```typescript
289
+ import { createStandardTieredCache } from '@lov3kaizen/agentsea-embeddings';
290
+
291
+ const cache = createStandardTieredCache({
292
+ memoryMaxEntries: 1000,
293
+ persistentPath: './embeddings.db',
294
+ });
295
+ ```
296
+
297
+ ## Vector Stores
298
+
299
+ ### Memory Store
300
+
301
+ ```typescript
302
+ import { createMemoryStore } from '@lov3kaizen/agentsea-embeddings';
303
+
304
+ const store = createMemoryStore({
305
+ dimensions: 1536,
306
+ metric: 'cosine',
307
+ });
308
+ ```
309
+
310
+ ### Pinecone
311
+
312
+ ```typescript
313
+ import { createPineconeStore } from '@lov3kaizen/agentsea-embeddings';
314
+
315
+ const store = createPineconeStore({
316
+ apiKey: process.env.PINECONE_API_KEY!,
317
+ indexName: 'my-index',
318
+ namespace: 'default',
319
+ });
320
+
321
+ await store.init();
322
+ ```
323
+
324
+ ### Chroma
325
+
326
+ ```typescript
327
+ import { createChromaStore } from '@lov3kaizen/agentsea-embeddings';
328
+
329
+ const store = createChromaStore({
330
+ url: 'http://localhost:8000',
331
+ collectionName: 'my-collection',
332
+ });
333
+
334
+ await store.init();
335
+ ```
336
+
337
+ ### Qdrant
338
+
339
+ ```typescript
340
+ import { createQdrantStore } from '@lov3kaizen/agentsea-embeddings';
341
+
342
+ const store = createQdrantStore({
343
+ url: 'http://localhost:6333',
344
+ collectionName: 'my-collection',
345
+ dimensions: 1536,
346
+ });
347
+
348
+ await store.init();
349
+ ```
350
+
351
+ ## Version Management
352
+
353
+ ```typescript
354
+ import { createVersionRegistry } from '@lov3kaizen/agentsea-embeddings';
355
+
356
+ const registry = createVersionRegistry();
357
+
358
+ // Register versions
359
+ const v1 = registry.register({
360
+ name: 'v1',
361
+ provider: 'openai',
362
+ model: 'text-embedding-ada-002',
363
+ dimensions: 1536,
364
+ });
365
+
366
+ const v2 = registry.register({
367
+ name: 'v2',
368
+ provider: 'openai',
369
+ model: 'text-embedding-3-small',
370
+ dimensions: 1536,
371
+ });
372
+
373
+ // Activate version
374
+ registry.activate(v2.id);
375
+
376
+ // Compare versions
377
+ const comparison = registry.compare(v1.id, v2.id);
378
+ console.log('Migration required:', comparison.migrationRequired);
379
+
380
+ // Deprecate old version
381
+ registry.deprecate(v1.id, 'Replaced by v2', v2.id);
382
+ ```
383
+
384
+ ## Drift Detection
385
+
386
+ ```typescript
387
+ import { createDriftDetector } from '@lov3kaizen/agentsea-embeddings';
388
+
389
+ const detector = createDriftDetector({
390
+ driftThreshold: 0.1,
391
+ alertSeverity: 'medium',
392
+ });
393
+
394
+ // Set reference distribution
395
+ const referenceEmbeddings = await manager.embedBatch(referenceTexts);
396
+ detector.setReference(
397
+ referenceEmbeddings.results.map((r) => r.vector),
398
+ 'text-embedding-3-small',
399
+ );
400
+
401
+ // Monitor for drift
402
+ detector.on('drift:detected', (result) => {
403
+ console.log('Drift detected!', result.severity, result.driftScore);
404
+ });
405
+
406
+ // Add samples for monitoring
407
+ for (const embedding of newEmbeddings) {
408
+ detector.addSample(embedding.vector);
409
+ }
410
+
411
+ // Or detect manually
412
+ const currentEmbeddings = await manager.embedBatch(currentTexts);
413
+ const driftResult = detector.detect(
414
+ currentEmbeddings.results.map((r) => r.vector),
415
+ );
416
+ ```
417
+
418
+ ## API Reference
419
+
420
+ ### EmbeddingManager
421
+
422
+ - `registerModel(model, isDefault?)` - Register an embedding model
423
+ - `embed(text, options?)` - Embed a single text
424
+ - `embedBatch(texts, options?)` - Embed multiple texts
425
+ - `embedDocument(text, options?)` - Chunk and embed a document
426
+ - `search(query, options?)` - Search for similar content
427
+ - `similarity(text1, text2)` - Calculate similarity between texts
428
+ - `setCache(cache)` - Set cache implementation
429
+ - `setChunker(chunker)` - Set chunker implementation
430
+ - `setStore(store)` - Set store implementation
431
+ - `getStats()` - Get embedding statistics
432
+
433
+ ### Providers
434
+
435
+ All providers implement:
436
+
437
+ - `embed(text, options?)` - Embed a single text
438
+ - `embedBatch(texts, options?)` - Batch embedding
439
+ - `countTokens(text)` - Count tokens
440
+ - `getMetrics()` - Get provider metrics
441
+ - `getHealth()` - Check provider health
442
+
443
+ ### Chunkers
444
+
445
+ All chunkers implement:
446
+
447
+ - `chunk(text, options?)` - Chunk text
448
+ - `chunkWithResult(text, options?)` - Chunk with metadata
449
+
450
+ ### Caches
451
+
452
+ All caches implement:
453
+
454
+ - `get(key)` - Get cached embedding
455
+ - `set(key, entry)` - Cache embedding
456
+ - `has(key)` - Check if key exists
457
+ - `delete(key)` - Delete entry
458
+ - `clear()` - Clear all entries
459
+ - `lookup(key)` - Look up an entry with stats
460
+ - `getStats()` - Get cache statistics
461
+
462
+ ### Stores
463
+
464
+ All stores implement:
465
+
466
+ - `upsert(records, options?)` - Upsert vectors
467
+ - `query(vector, options?)` - Query similar vectors
468
+ - `delete(ids, options?)` - Delete vectors
469
+ - `deleteAll(options?)` - Delete all vectors
470
+ - `getStats()` - Get store statistics
471
+ - `checkHealth()` - Check store health
472
+
473
+ ## License
474
+
475
+ MIT