@yamo/memory-mesh 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/LICENSE +21 -0
  2. package/README.md +80 -0
  3. package/bin/memory_mesh.js +69 -0
  4. package/bin/scrubber.js +81 -0
  5. package/index.d.ts +111 -0
  6. package/lib/adapters/index.js +3 -0
  7. package/lib/embeddings/factory.js +150 -0
  8. package/lib/embeddings/index.js +2 -0
  9. package/lib/embeddings/service.js +586 -0
  10. package/lib/index.js +18 -0
  11. package/lib/lancedb/client.js +631 -0
  12. package/lib/lancedb/config.js +215 -0
  13. package/lib/lancedb/errors.js +144 -0
  14. package/lib/lancedb/index.js +4 -0
  15. package/lib/lancedb/schema.js +197 -0
  16. package/lib/memory/index.js +3 -0
  17. package/lib/memory/memory-context-manager.js +388 -0
  18. package/lib/memory/memory-mesh.js +910 -0
  19. package/lib/memory/memory-translator.js +130 -0
  20. package/lib/memory/migrate-memory.js +227 -0
  21. package/lib/memory/migrate-to-v2.js +120 -0
  22. package/lib/memory/scorer.js +85 -0
  23. package/lib/memory/vector-memory.js +364 -0
  24. package/lib/privacy/audit-logger.js +176 -0
  25. package/lib/privacy/dlp-redactor.js +72 -0
  26. package/lib/privacy/index.js +10 -0
  27. package/lib/reporting/skill-report-generator.js +283 -0
  28. package/lib/scrubber/.gitkeep +1 -0
  29. package/lib/scrubber/config/defaults.js +62 -0
  30. package/lib/scrubber/errors/scrubber-error.js +43 -0
  31. package/lib/scrubber/index.js +25 -0
  32. package/lib/scrubber/scrubber.js +130 -0
  33. package/lib/scrubber/stages/chunker.js +103 -0
  34. package/lib/scrubber/stages/metadata-annotator.js +74 -0
  35. package/lib/scrubber/stages/normalizer.js +59 -0
  36. package/lib/scrubber/stages/semantic-filter.js +61 -0
  37. package/lib/scrubber/stages/structural-cleaner.js +82 -0
  38. package/lib/scrubber/stages/validator.js +66 -0
  39. package/lib/scrubber/telemetry.js +66 -0
  40. package/lib/scrubber/utils/hash.js +39 -0
  41. package/lib/scrubber/utils/html-parser.js +45 -0
  42. package/lib/scrubber/utils/pattern-matcher.js +63 -0
  43. package/lib/scrubber/utils/token-counter.js +31 -0
  44. package/lib/search/filter.js +275 -0
  45. package/lib/search/hybrid.js +137 -0
  46. package/lib/search/index.js +3 -0
  47. package/lib/search/pattern-miner.js +160 -0
  48. package/lib/utils/error-sanitizer.js +84 -0
  49. package/lib/utils/handoff-validator.js +85 -0
  50. package/lib/utils/index.js +4 -0
  51. package/lib/utils/spinner.js +190 -0
  52. package/lib/utils/streaming-client.js +128 -0
  53. package/package.json +39 -0
  54. package/skills/SKILL.md +462 -0
  55. package/skills/skill-scrubber.yamo +41 -0
package/lib/embeddings/service.js ADDED
@@ -0,0 +1,586 @@
+ /**
+  * EmbeddingService - Multi-provider embedding generation service
+  *
+  * Supports:
+  * - Local models: Xenova/Transformers.js (ONNX runtime)
+  * - Ollama: Local Ollama embeddings API
+  * - API models: OpenAI, Cohere
+  *
+  * Implements TDD for Phase 3, Task 3.1 - Embedding Service Architecture
+  */
+
+ import crypto from "crypto";
+ import {
+   ConfigurationError,
+   EmbeddingError
+ } from "../lancedb/errors.js";
+
+ /**
+  * EmbeddingService provides a unified interface for generating text embeddings
+  * using multiple backend providers (local ONNX models or cloud APIs).
+  */
+ class EmbeddingService {
+   /**
+    * Create a new EmbeddingService instance
+    * @param {Object} [config={}] - Configuration options
+    * @param {string} [config.modelType] - Type of model ('local', 'ollama', 'openai', 'cohere')
+    * @param {string} [config.modelName] - Name of the model to use
+    * @param {string} [config.baseUrl] - Base URL for Ollama (default: http://localhost:11434)
+    * @param {number} [config.dimension] - Embedding dimension
+    * @param {number} [config.batchSize] - Maximum batch size for processing
+    * @param {boolean} [config.normalize] - Whether to normalize embeddings to unit length
+    * @param {number} [config.cacheMaxSize] - Maximum size of the LRU cache
+    * @param {string} [config.apiKey] - API key for cloud providers
+    */
+   constructor(config = {}) {
+     this.modelType = config?.modelType || process.env.EMBEDDING_MODEL_TYPE || 'local';
+     this.modelName = config?.modelName || process.env.EMBEDDING_MODEL_NAME || 'Xenova/all-MiniLM-L6-v2';
+     this.baseUrl = config?.baseUrl || process.env.OLLAMA_BASE_URL || process.env.EMBEDDING_BASE_URL || 'http://localhost:11434';
+     this.dimension = config?.dimension || parseInt(process.env.EMBEDDING_DIMENSION || '384', 10) || 384;
+     this.batchSize = config?.batchSize || parseInt(process.env.EMBEDDING_BATCH_SIZE || '32', 10) || 32;
+     this.normalize = config?.normalize !== undefined ? config.normalize : (process.env.EMBEDDING_NORMALIZE !== 'false');
+
+     this.apiKey = config?.apiKey || process.env.EMBEDDING_API_KEY;
+
+     this.model = null;
+     this.cache = new Map();
+     this.cacheMaxSize = config?.cacheMaxSize || 1000;
+     this.initialized = false;
+
+     // Statistics
+     this.stats = {
+       totalEmbeddings: 0,
+       cacheHits: 0,
+       cacheMisses: 0,
+       batchCount: 0
+     };
+   }
+
+   /**
+    * Initialize the embedding model
+    * Loads the model based on modelType (local, ollama, openai, cohere)
+    */
+   async init() {
+     try {
+       switch (this.modelType) {
+         case 'local':
+           await this._initLocalModel();
+           break;
+         case 'ollama':
+           await this._initOllama();
+           break;
+         case 'openai':
+           await this._initOpenAI();
+           break;
+         case 'cohere':
+           await this._initCohere();
+           break;
+         default:
+           throw new ConfigurationError(
+             `Unknown model type: ${this.modelType}. Must be 'local', 'ollama', 'openai', or 'cohere'`,
+             { modelType: this.modelType }
+           );
+       }
+
+       this.initialized = true;
+     } catch (error) {
+       if (error instanceof ConfigurationError) {
+         throw error;
+       }
+       const message = error instanceof Error ? error.message : String(error);
+       throw new EmbeddingError(
+         `Failed to initialize embedding service: ${message}`,
+         { modelType: this.modelType, modelName: this.modelName, originalError: message }
+       );
+     }
+   }
+
+   /**
+    * Generate embedding for a single text
+    * @param {string} text - Text to embed
+    * @param {Object} [options={}] - Options for embedding generation
+    * @returns {Promise<number[]>} Embedding vector
+    */
+   async embed(text, options = {}) {
+     if (!this.initialized) {
+       throw new EmbeddingError(
+         'Embedding service not initialized. Call init() first.',
+         { modelType: this.modelType }
+       );
+     }
+
+     if (!text || typeof text !== 'string') {
+       throw new EmbeddingError(
+         'Text must be a non-empty string',
+         { text, textType: typeof text }
+       );
+     }
+
+     // Check cache; re-insert on hit so Map insertion order tracks recency
+     // and _setCache's eviction is genuinely least-recently-used
+     const cacheKey = this._getCacheKey(text);
+     const cached = this.cache.get(cacheKey);
+     if (cached) {
+       this.stats.cacheHits++;
+       this.cache.delete(cacheKey);
+       this.cache.set(cacheKey, cached);
+       return cached;
+     }
+
+     // Generate embedding
+     let embedding;
+     try {
+       switch (this.modelType) {
+         case 'local':
+           embedding = await this._embedLocal(text);
+           break;
+         case 'ollama':
+           embedding = await this._embedOllama(text);
+           break;
+         case 'openai':
+           embedding = await this._embedOpenAI(text);
+           break;
+         case 'cohere':
+           embedding = await this._embedCohere(text);
+           break;
+         default:
+           throw new EmbeddingError(
+             `Unknown model type: ${this.modelType}`,
+             { modelType: this.modelType }
+           );
+       }
+
+       // Normalize if enabled
+       if (this.normalize) {
+         embedding = this._normalize(embedding);
+       }
+
+       // Cache result
+       this._setCache(cacheKey, embedding);
+
+       this.stats.totalEmbeddings++;
+       this.stats.cacheMisses++;
+
+       return embedding;
+     } catch (error) {
+       if (error instanceof EmbeddingError) {
+         throw error;
+       }
+       const message = error instanceof Error ? error.message : String(error);
+       throw new EmbeddingError(
+         `Failed to generate embedding: ${message}`,
+         { modelType: this.modelType, text: text.substring(0, 100) }
+       );
+     }
+   }
+
+   /**
+    * Generate embeddings for a batch of texts
+    * @param {string[]} texts - Array of texts to embed
+    * @param {Object} [options={}] - Options for embedding generation
+    * @returns {Promise<number[][]>} Array of embedding vectors
+    */
+   async embedBatch(texts, options = {}) {
+     if (!this.initialized) {
+       throw new EmbeddingError(
+         'Embedding service not initialized. Call init() first.',
+         { modelType: this.modelType }
+       );
+     }
+
+     if (!Array.isArray(texts)) {
+       throw new EmbeddingError(
+         'Texts must be an array',
+         { textsType: typeof texts }
+       );
+     }
+
+     if (texts.length === 0) {
+       return [];
+     }
+
+     try {
+       const embeddings = [];
+
+       // Process in batches of at most this.batchSize
+       for (let i = 0; i < texts.length; i += this.batchSize) {
+         const batch = texts.slice(i, Math.min(i + this.batchSize, texts.length));
+
+         // Generate embeddings for the batch concurrently
+         const batchEmbeddings = await Promise.all(
+           batch.map(text => this.embed(text, options))
+         );
+
+         embeddings.push(...batchEmbeddings);
+         this.stats.batchCount++;
+       }
+
+       return embeddings;
+     } catch (error) {
+       if (error instanceof EmbeddingError) {
+         throw error;
+       }
+       const message = error instanceof Error ? error.message : String(error);
+       throw new EmbeddingError(
+         `Failed to generate batch embeddings: ${message}`,
+         { modelType: this.modelType, batchSize: texts.length }
+       );
+     }
+   }
+
+   /**
+    * Initialize local ONNX model using Xenova/Transformers.js
+    * @private
+    */
+   async _initLocalModel() {
+     try {
+       // Dynamic import to allow optional dependency
+       const { pipeline } = await import("@xenova/transformers");
+
+       // Load feature extraction pipeline
+       // @ts-ignore
+       this.model = await pipeline(
+         'feature-extraction',
+         this.modelName,
+         {
+           quantized: true,
+           // Suppress model download progress output
+           progress_callback: () => {}
+         }
+       );
+
+       // Update dimension based on model (384 for all-MiniLM-L6-v2)
+       if (this.modelName.includes('all-MiniLM-L6-v2')) {
+         this.dimension = 384;
+       }
+     } catch (error) {
+       const message = error instanceof Error ? error.message : String(error);
+       throw new ConfigurationError(
+         `Failed to load local model: ${message}. Make sure @xenova/transformers is installed.`,
+         { modelName: this.modelName, error: message }
+       );
+     }
+   }
+
+   /**
+    * Initialize the Ollama backend
+    * Ollama runs locally and doesn't require authentication
+    * @private
+    */
+   async _initOllama() {
+     // Ollama needs no client object - it's a local HTTP API.
+     // Store the base URL and model name for use in _embedOllama.
+     this.model = {
+       baseUrl: this.baseUrl,
+       modelName: this.modelName || 'nomic-embed-text'
+     };
+
+     // Set default dimension for common Ollama embedding models
+     if (this.modelName.includes('nomic-embed-text')) {
+       this.dimension = 768;
+     } else if (this.modelName.includes('mxbai-embed')) {
+       this.dimension = 1024;
+     } else if (this.modelName.includes('all-MiniLM')) {
+       this.dimension = 384;
+     }
+   }
+
+   /**
+    * Initialize OpenAI client
+    * @private
+    */
+   async _initOpenAI() {
+     if (!this.apiKey) {
+       throw new ConfigurationError(
+         'OpenAI API key is required. Set EMBEDDING_API_KEY environment variable or pass apiKey in config.',
+         { modelType: 'openai' }
+       );
+     }
+
+     try {
+       // Dynamic import to allow optional dependency
+       const { OpenAI } = await import("openai");
+       // @ts-ignore - OpenAI constructor
+       this.model = new OpenAI({ apiKey: this.apiKey });
+
+       // Update dimension for OpenAI models
+       if (this.modelName.includes('text-embedding-ada-002')) {
+         this.dimension = 1536;
+       }
+     } catch (error) {
+       const message = error instanceof Error ? error.message : String(error);
+       throw new ConfigurationError(
+         `Failed to initialize OpenAI client: ${message}. Make sure the openai package is installed.`,
+         { error: message }
+       );
+     }
+   }
+
+   /**
+    * Initialize Cohere client
+    * @private
+    */
+   async _initCohere() {
+     if (!this.apiKey) {
+       throw new ConfigurationError(
+         'Cohere API key is required. Set EMBEDDING_API_KEY environment variable or pass apiKey in config.',
+         { modelType: 'cohere' }
+       );
+     }
+
+     try {
+       // Dynamic import to allow optional dependency
+       const cohere = await import("cohere-ai");
+       // @ts-ignore - Cohere constructor
+       this.model = new cohere.CohereClient({ token: this.apiKey });
+
+       // Update dimension for Cohere models
+       if (this.modelName.includes('embed-english-v3.0')) {
+         this.dimension = 1024;
+       }
+     } catch (error) {
+       const message = error instanceof Error ? error.message : String(error);
+       throw new ConfigurationError(
+         `Failed to initialize Cohere client: ${message}. Make sure the cohere-ai package is installed.`,
+         { error: message }
+       );
+     }
+   }
+
+   /**
+    * Generate embedding using the local ONNX model
+    * @param {string} text - Text to embed
+    * @returns {Promise<number[]>} Embedding vector
+    * @private
+    */
+   async _embedLocal(text) {
+     if (!this.model) throw new EmbeddingError('Model not initialized');
+     try {
+       // Mean-pool token embeddings; normalization is handled in embed()
+       // @ts-ignore - local model call
+       const output = await this.model(text, {
+         pooling: 'mean',
+         normalize: false
+       });
+
+       // Convert from tensor to plain array
+       // @ts-ignore
+       return Array.from(output.data);
+     } catch (error) {
+       const message = error instanceof Error ? error.message : String(error);
+       throw new EmbeddingError(
+         `Failed to generate local embedding: ${message}`,
+         { modelName: this.modelName, text: text.substring(0, 100) }
+       );
+     }
+   }
+
+   /**
+    * Generate embedding using the Ollama API
+    * @param {string} text - Text to embed
+    * @returns {Promise<number[]>} Embedding vector
+    * @private
+    */
+   async _embedOllama(text) {
+     if (!this.model) throw new EmbeddingError('Model not initialized');
+     try {
+       // @ts-ignore - Ollama model object
+       const baseUrl = this.model.baseUrl;
+       // @ts-ignore
+       const modelName = this.model.modelName;
+
+       const response = await fetch(`${baseUrl}/api/embeddings`, {
+         method: 'POST',
+         headers: {
+           'Content-Type': 'application/json'
+         },
+         body: JSON.stringify({
+           model: modelName,
+           prompt: text
+         })
+       });
+
+       if (!response.ok) {
+         const errorText = await response.text();
+         throw new EmbeddingError(
+           `Ollama API error: ${response.status} ${response.statusText} - ${errorText}`,
+           { baseUrl, modelName }
+         );
+       }
+
+       const data = await response.json();
+
+       if (!data.embedding) {
+         throw new EmbeddingError(
+           'Invalid response from Ollama API: missing embedding field',
+           { response: data }
+         );
+       }
+
+       return data.embedding;
+     } catch (error) {
+       if (error instanceof EmbeddingError) {
+         throw error;
+       }
+       const message = error instanceof Error ? error.message : String(error);
+       // @ts-ignore
+       const baseUrl = this.model?.baseUrl;
+       // @ts-ignore
+       const modelName = this.model?.modelName;
+       throw new EmbeddingError(
+         `Failed to generate Ollama embedding: ${message}. Make sure Ollama is running and the model is available.`,
+         { baseUrl, modelName, error: message }
+       );
+     }
+   }
+
+   /**
+    * Generate embedding using the OpenAI API
+    * @param {string} text - Text to embed
+    * @returns {Promise<number[]>} Embedding vector
+    * @private
+    */
+   async _embedOpenAI(text) {
+     if (!this.model) throw new EmbeddingError('Model not initialized');
+     try {
+       // @ts-ignore - OpenAI client
+       const response = await this.model.embeddings.create({
+         model: this.modelName,
+         input: text
+       });
+
+       return response.data[0].embedding;
+     } catch (error) {
+       const message = error instanceof Error ? error.message : String(error);
+       throw new EmbeddingError(
+         `Failed to generate OpenAI embedding: ${message}`,
+         { modelName: this.modelName, error: message }
+       );
+     }
+   }
+
+   /**
+    * Generate embedding using the Cohere API
+    * @param {string} text - Text to embed
+    * @returns {Promise<number[]>} Embedding vector
+    * @private
+    */
+   async _embedCohere(text) {
+     if (!this.model) throw new EmbeddingError('Model not initialized');
+     try {
+       // @ts-ignore - Cohere client
+       const response = await this.model.embed({
+         model: this.modelName,
+         texts: [text],
+         inputType: 'search_document'
+       });
+
+       return response.embeddings[0];
+     } catch (error) {
+       const message = error instanceof Error ? error.message : String(error);
+       throw new EmbeddingError(
+         `Failed to generate Cohere embedding: ${message}`,
+         { modelName: this.modelName, error: message }
+       );
+     }
+   }
+
+   /**
+    * Normalize vector to unit length (L2 norm)
+    * e.g. [3, 4] has magnitude 5 and normalizes to [0.6, 0.8]
+    * @param {number[]} vector - Vector to normalize
+    * @returns {number[]} Normalized vector
+    * @private
+    */
+   _normalize(vector) {
+     // Calculate the Euclidean magnitude
+     const magnitude = Math.sqrt(
+       vector.reduce((sum, val) => sum + val * val, 0)
+     );
+
+     // Avoid division by zero for the all-zero vector
+     if (magnitude === 0) {
+       return vector.map(() => 0);
+     }
+
+     // Scale each component by 1 / magnitude
+     return vector.map(val => val / magnitude);
+   }
+
+   /**
+    * Generate cache key from text
+    * MD5 is used here only as a fast, non-cryptographic fingerprint
+    * @param {string} text - Text to generate key from
+    * @returns {string} Cache key
+    * @private
+    */
+   _getCacheKey(text) {
+     return crypto
+       .createHash('md5')
+       .update(text)
+       .digest('hex');
+   }
+
+   /**
+    * Set cache value with LRU eviction
+    * Map preserves insertion order and embed() re-inserts entries on hit,
+    * so the first key is always the least recently used
+    * @param {string} key - Cache key
+    * @param {number[]} value - Embedding vector
+    * @private
+    */
+   _setCache(key, value) {
+     // Evict the least recently used entry if at capacity
+     if (this.cache.size >= this.cacheMaxSize) {
+       const firstKey = this.cache.keys().next().value;
+       this.cache.delete(firstKey);
+     }
+
+     this.cache.set(key, value);
+   }
+
+   /**
+    * Get service statistics
+    * @returns {Object} Statistics object
+    */
+   getStats() {
+     return {
+       modelType: this.modelType,
+       modelName: this.modelName,
+       dimension: this.dimension,
+       initialized: this.initialized,
+       totalEmbeddings: this.stats.totalEmbeddings,
+       cacheHits: this.stats.cacheHits,
+       cacheMisses: this.stats.cacheMisses,
+       cacheSize: this.cache.size,
+       cacheMaxSize: this.cacheMaxSize,
+       cacheHitRate: this.stats.cacheHits / (this.stats.cacheHits + this.stats.cacheMisses) || 0,
+       batchCount: this.stats.batchCount,
+       batchSize: this.batchSize,
+       normalize: this.normalize
+     };
+   }
+
+   /**
+    * Clear the embedding cache
+    */
+   clearCache() {
+     this.cache.clear();
+   }
+
+   /**
+    * Reset statistics
+    */
+   resetStats() {
+     this.stats = {
+       totalEmbeddings: 0,
+       cacheHits: 0,
+       cacheMisses: 0,
+       batchCount: 0
+     };
+   }
+ }
+
+ export default EmbeddingService;
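
For orientation, here is a minimal usage sketch of the service above. The constructor options and method names come from the diff itself; the sample inputs and the Ollama alternative in the comment are illustrative, and the deep import path is an assumption (whether lib/ files are directly importable depends on the package's exports map).

// Usage sketch (illustrative; the deep import path is an assumption)
import EmbeddingService from "@yamo/memory-mesh/lib/embeddings/service.js";

const service = new EmbeddingService({ modelType: 'local' });
// or, against a local Ollama server:
// const service = new EmbeddingService({ modelType: 'ollama', modelName: 'nomic-embed-text' });

await service.init();

const vector = await service.embed('hello world');        // 384-dim, unit length by default
const vectors = await service.embedBatch(['alpha', 'beta', 'hello world']);

console.log(service.getStats().cacheHitRate);             // 'hello world' was served from the cache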
package/lib/index.js ADDED
@@ -0,0 +1,18 @@
+ // Core module exports
+ export * from './lancedb/index.js';
+ export * from './embeddings/index.js';
+ export * from './search/index.js';
+ export * from './privacy/index.js';
+ export * from './memory/index.js';
+ export * from './scrubber/index.js';
+ export {
+   HandoffValidator,
+   Spinner,
+   ProgressBar,
+   MultiSpinner,
+   StreamingClient,
+   StreamingLLM,
+   sanitizeErrorForLogging,
+   withSanitizedErrors
+ } from './utils/index.js';
+ export * from './adapters/index.js';
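
Because lib/index.js re-exports every submodule as a barrel, consumers can pull named utilities from a single entry point. A brief sketch, assuming package.json resolves the package root to lib/index.js:

// Sketch assuming the package entry resolves to lib/index.js
import { HandoffValidator, Spinner, StreamingClient } from "@yamo/memory-mesh";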