ruvector 0.1.30 → 0.1.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,386 @@
1
+ /**
2
+ * Embedding Service - Unified embedding generation and management
3
+ *
4
+ * This service provides a unified interface for generating, caching, and
5
+ * managing embeddings from various sources (local models, APIs, etc.)
6
+ */
7
+
8
/**
 * Contract every embedding backend must fulfil so it can be registered
 * with the embedding service.
 */
export interface EmbeddingProvider {
  /** Name under which the provider is registered and looked up. */
  name: string;

  /**
   * Generate embeddings for a list of texts.
   *
   * @param texts - Texts to embed
   * @returns Promise resolving to the embedding vectors; the service pairs
   *   them with the input texts by position
   */
  embed(texts: string[]): Promise<number[][]>;

  /**
   * @returns The dimensionality reported by this provider
   */
  getDimensions(): number;
}
19
+
20
/**
 * A single cached embedding plus the bookkeeping used for expiry and eviction.
 */
interface CacheEntry {
  /** The cached embedding vector. */
  embedding: number[];
  /** `Date.now()` value captured by the call that stored the entry. */
  timestamp: number;
  /** How many times the entry has been served from the cache. */
  hits: number;
}
28
+
29
/**
 * Options accepted by the embedding service constructor.
 * Every field is optional; the service substitutes a default for any
 * field that is `undefined` or `null`.
 */
export interface EmbeddingServiceConfig {
  /** Name of the provider used when a call does not name one (default: 'local-ngram'). */
  defaultProvider?: string;
  /** Upper bound on the number of cached embeddings (default: 10000). */
  maxCacheSize?: number;
  /** Lifetime of a cache entry, in milliseconds (default: 3600000, i.e. one hour). */
  cacheTtl?: number;
  /** Maximum number of texts handed to a provider per call (default: 32). */
  batchSize?: number;
}
42
+
43
/**
 * Build a compact, deterministic cache key fragment for a text.
 *
 * A single 32-bit polynomial hash (the scheme used previously) collides on
 * trivially different inputs - e.g. "Aa" and "BB" hash to the same value -
 * which would make the embedding cache hand back the vector of a different
 * text. To make that practically impossible the key combines:
 *   - the original 31-based polynomial hash,
 *   - an independent 32-bit FNV-1a hash, and
 *   - the text length.
 *
 * Both hashes are computed over UTF-16 code units.
 *
 * @param text - Text to hash
 * @returns Key of the form `h<poly>_<fnv>_<length>` (base-36 components)
 */
function hashText(text: string): string {
  let polyHash = 0;
  let fnvHash = 0x811c9dc5; // FNV-1a 32-bit offset basis

  for (let i = 0; i < text.length; i++) {
    const code = text.charCodeAt(i);
    // hash * 31 + code, truncated to a signed 32-bit integer.
    polyHash = (((polyHash << 5) - polyHash) + code) | 0;
    // Math.imul keeps the multiplication exact in 32-bit space.
    fnvHash = Math.imul(fnvHash ^ code, 0x01000193); // FNV 32-bit prime
  }

  const poly = polyHash.toString(36);
  const fnv = (fnvHash >>> 0).toString(36);
  return `h${poly}_${fnv}_${text.length.toString(36)}`;
}
55
+
56
/**
 * Mock embedding provider for testing.
 *
 * Produces deterministic, unit-length pseudo-random vectors: the same text
 * always yields the same embedding. The vectors carry no semantic meaning.
 */
export class MockEmbeddingProvider implements EmbeddingProvider {
  name = 'mock';
  private dimensions: number;

  /**
   * @param dimensions - Length of the generated vectors (positive integer)
   * @throws RangeError if `dimensions` is not a positive integer
   */
  constructor(dimensions: number = 384) {
    if (!Number.isInteger(dimensions) || dimensions <= 0) {
      throw new RangeError(`dimensions must be a positive integer, got ${dimensions}`);
    }
    this.dimensions = dimensions;
  }

  /**
   * Generate one deterministic pseudo-random unit vector per text.
   *
   * @param texts - Texts to embed
   * @returns One vector of `dimensions` numbers per input text
   */
  async embed(texts: string[]): Promise<number[][]> {
    return texts.map(text => {
      // Seed the generator from the text (31-based polynomial hash, int32).
      let seed = 0;
      for (let i = 0; i < text.length; i++) {
        seed = ((seed << 5) - seed + text.charCodeAt(i)) | 0;
      }

      const embedding: number[] = [];
      for (let i = 0; i < this.dimensions; i++) {
        // Linear congruential step. Math.imul keeps the product exact in
        // 32-bit space; a plain `*` exceeds 2^53 and silently drops low bits.
        seed = (Math.imul(seed, 1103515245) + 12345) | 0;
        // `>>> 0` makes the remainder non-negative so values lie in [-0.5, 0.5).
        embedding.push(((seed >>> 0) % 1000) / 1000 - 0.5);
      }

      // Normalize to unit length (guarding against an all-zero vector).
      const norm = Math.sqrt(embedding.reduce((s, v) => s + v * v, 0));
      return embedding.map(v => v / (norm || 1));
    });
  }

  /** @returns The configured vector length */
  getDimensions(): number {
    return this.dimensions;
  }
}
91
+
92
/**
 * Simple local embedding using character n-grams.
 * This is a fallback when no external provider is available.
 *
 * Each text is lower-cased, every character that is not a letter or digit
 * is replaced by a space, and every character n-gram is hashed into one of
 * `dimensions` buckets with a +1/-1 sign. The result is scaled to unit
 * length. Letters and digits of any script are kept, so non-Latin texts do
 * not all collapse onto the same vector.
 */
export class LocalNGramProvider implements EmbeddingProvider {
  name = 'local-ngram';
  private dimensions: number;
  private ngramSize: number;

  /**
   * @param dimensions - Length of the generated vectors (positive integer)
   * @param ngramSize - Number of characters per n-gram (positive integer)
   * @throws RangeError if either argument is not a positive integer
   */
  constructor(dimensions: number = 256, ngramSize: number = 3) {
    if (!Number.isInteger(dimensions) || dimensions <= 0) {
      throw new RangeError(`dimensions must be a positive integer, got ${dimensions}`);
    }
    if (!Number.isInteger(ngramSize) || ngramSize <= 0) {
      throw new RangeError(`ngramSize must be a positive integer, got ${ngramSize}`);
    }
    this.dimensions = dimensions;
    this.ngramSize = ngramSize;
  }

  /**
   * Embed each text independently.
   *
   * @param texts - Texts to embed
   * @returns One vector of `dimensions` numbers per input text
   */
  async embed(texts: string[]): Promise<number[][]> {
    return texts.map(text => this.embedSingle(text));
  }

  /**
   * Embed a single text. An empty text yields the all-zero vector.
   */
  private embedSingle(text: string): number[] {
    const embedding: number[] = new Array<number>(this.dimensions).fill(0);

    // Keep letters/digits of every script; for pure-ASCII input this is
    // exactly the same normalization as /[^a-z0-9]/ after lower-casing.
    const normalized = text.toLowerCase().replace(/[^\p{L}\p{N}]/gu, ' ');
    // Split by code point so surrogate pairs are never cut in half.
    const chars = Array.from(normalized);

    const ngrams: string[] = [];
    if (chars.length >= this.ngramSize) {
      for (let i = 0; i <= chars.length - this.ngramSize; i++) {
        ngrams.push(chars.slice(i, i + this.ngramSize).join(''));
      }
    } else if (chars.length > 0) {
      // Text shorter than one n-gram: hash it as a single gram so short
      // inputs do not all collapse to the zero vector.
      ngrams.push(normalized);
    }

    // Hash n-grams into embedding dimensions with a sign taken from the hash.
    for (const ngram of ngrams) {
      const hash = this.hashNgram(ngram);
      const idx = Math.abs(hash) % this.dimensions;
      embedding[idx] += hash > 0 ? 1 : -1;
    }

    // Normalize to unit length (guarding against an all-zero vector).
    const norm = Math.sqrt(embedding.reduce((s, v) => s + v * v, 0));
    return embedding.map(v => v / (norm || 1));
  }

  /**
   * 31-based polynomial hash over UTF-16 code units, truncated to int32.
   */
  private hashNgram(ngram: string): number {
    let hash = 0;
    for (let i = 0; i < ngram.length; i++) {
      hash = ((hash << 5) - hash + ngram.charCodeAt(i)) | 0;
    }
    return hash;
  }

  /** @returns The configured vector length */
  getDimensions(): number {
    return this.dimensions;
  }
}
139
+
140
/**
 * Embedding service with caching and batching.
 *
 * Keeps a registry of named providers and an in-memory cache of generated
 * embeddings keyed by provider name and text hash. Cache entries expire
 * after `cacheTtl` milliseconds, and once `maxCacheSize` entries exist the
 * entry with the fewest hits (oldest first on ties) is evicted.
 */
export class EmbeddingService {
  private providers: Map<string, EmbeddingProvider> = new Map();
  private cache: Map<string, CacheEntry> = new Map();
  private config: Required<EmbeddingServiceConfig>;
  /** Cache lookups that were served from the cache since the last clear. */
  private cacheHits = 0;
  /** Cache lookups that required a provider call since the last clear. */
  private cacheMisses = 0;

  /**
   * @param config - Optional settings; missing fields fall back to defaults
   * @throws RangeError if `batchSize` is not >= 1 (a smaller value would make
   *   batching never terminate) or if `maxCacheSize` is NaN
   */
  constructor(config: EmbeddingServiceConfig = {}) {
    const batchSize = config.batchSize ?? 32;
    if (!(batchSize >= 1)) {
      throw new RangeError(`batchSize must be >= 1, got ${batchSize}`);
    }
    const maxCacheSize = config.maxCacheSize ?? 10000;
    if (Number.isNaN(maxCacheSize)) {
      throw new RangeError('maxCacheSize must be a number');
    }

    this.config = {
      defaultProvider: config.defaultProvider ?? 'local-ngram',
      maxCacheSize,
      cacheTtl: config.cacheTtl ?? 3600000, // 1 hour
      batchSize: Math.floor(batchSize),
    };

    // Register default providers
    this.registerProvider(new LocalNGramProvider());
    this.registerProvider(new MockEmbeddingProvider());
  }

  /**
   * Register an embedding provider under its `name`, replacing any provider
   * previously registered under the same name.
   */
  registerProvider(provider: EmbeddingProvider): void {
    this.providers.set(provider.name, provider);
  }

  /**
   * Get a registered provider.
   *
   * @param name - Provider name (uses the configured default if omitted)
   * @throws Error if no provider is registered under that name
   */
  getProvider(name?: string): EmbeddingProvider {
    const providerName = name ?? this.config.defaultProvider;
    const provider = this.providers.get(providerName);
    if (!provider) {
      throw new Error(`Provider not found: ${providerName}`);
    }
    return provider;
  }

  /**
   * Generate embeddings for texts with caching.
   *
   * Texts with a fresh cache entry are served from the cache. The remaining
   * texts are de-duplicated and sent to the provider in batches of at most
   * `batchSize`. Returned vectors are the same array instances held by the
   * cache, so callers should treat them as read-only.
   *
   * @param texts - Texts to embed
   * @param provider - Provider name (uses default if not specified)
   * @returns Array of embeddings, one per input text, in input order
   * @throws Error if the provider is unknown or returns a different number
   *   of embeddings than it was given texts
   */
  async embed(texts: string[], provider?: string): Promise<number[][]> {
    const providerInstance = this.getProvider(provider);
    const providerName = providerInstance.name;
    const now = Date.now();

    const results: (number[] | null)[] = new Array(texts.length).fill(null);
    // Uncached text -> every position in `texts` where it occurs. Keyed by
    // the text itself so identical inputs are embedded only once per call.
    const pending = new Map<string, number[]>();

    for (let i = 0; i < texts.length; i++) {
      const text = texts[i];
      const cached = this.cache.get(`${providerName}:${hashText(text)}`);

      if (cached && now - cached.timestamp < this.config.cacheTtl) {
        results[i] = cached.embedding;
        cached.hits++;
        this.cacheHits++;
        continue;
      }

      this.cacheMisses++;
      const positions = pending.get(text);
      if (positions) {
        positions.push(i);
      } else {
        pending.set(text, [i]);
      }
    }

    // Map iteration order is insertion order, so the provider sees the
    // uncached texts in order of first occurrence.
    const uncachedTexts = Array.from(pending.keys());

    for (let start = 0; start < uncachedTexts.length; start += this.config.batchSize) {
      const batch = uncachedTexts.slice(start, start + this.config.batchSize);
      const embeddings = await providerInstance.embed(batch);

      // Without this check a short reply would leave `null` holes in the
      // result that the return type claims cannot exist.
      if (!Array.isArray(embeddings) || embeddings.length !== batch.length) {
        const got = Array.isArray(embeddings) ? embeddings.length : typeof embeddings;
        throw new Error(
          `Provider ${providerName} returned ${got} embeddings for ${batch.length} texts`
        );
      }

      batch.forEach((text, j) => {
        const embedding = embeddings[j];
        for (const index of pending.get(text) ?? []) {
          results[index] = embedding;
        }
        this.addToCache(`${providerName}:${hashText(text)}`, embedding, now);
      });
    }

    return results as number[][];
  }

  /**
   * Generate a single embedding.
   *
   * @param text - Text to embed
   * @param provider - Provider name (uses default if not specified)
   */
  async embedOne(text: string, provider?: string): Promise<number[]> {
    const results = await this.embed([text], provider);
    return results[0];
  }

  /**
   * Add an entry to the cache, evicting the least-used entry when full.
   *
   * The victim is the entry with the fewest hits; ties are broken by the
   * oldest timestamp. Refreshing a key that is already cached never evicts
   * another entry, and a `maxCacheSize` below 1 disables caching.
   */
  private addToCache(key: string, embedding: number[], timestamp: number): void {
    if (this.config.maxCacheSize < 1) {
      return;
    }

    if (!this.cache.has(key) && this.cache.size >= this.config.maxCacheSize) {
      let victimKey: string | undefined;
      let oldestTime = Infinity;
      let lowestHits = Infinity;

      for (const [k, v] of this.cache.entries()) {
        if (v.hits < lowestHits || (v.hits === lowestHits && v.timestamp < oldestTime)) {
          victimKey = k;
          oldestTime = v.timestamp;
          lowestHits = v.hits;
        }
      }

      if (victimKey !== undefined) {
        this.cache.delete(victimKey);
      }
    }

    this.cache.set(key, { embedding, timestamp, hits: 0 });
  }

  /**
   * Compute cosine similarity between two embeddings.
   *
   * @returns Similarity in [-1, 1], or 0 if either vector has zero length
   * @throws Error if the vectors have different dimensions
   */
  cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      throw new Error('Embeddings must have same dimensions');
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const denom = Math.sqrt(normA) * Math.sqrt(normB);
    return denom === 0 ? 0 : dotProduct / denom;
  }

  /**
   * Find most similar texts from a corpus.
   *
   * @param query - Text to compare against
   * @param corpus - Candidate texts
   * @param k - Maximum number of results; values <= 0 or NaN yield none
   * @param provider - Provider name (uses default if not specified)
   * @returns Up to `k` corpus entries sorted by descending cosine similarity,
   *   each with its index in `corpus`
   */
  async findSimilar(
    query: string,
    corpus: string[],
    k: number = 5,
    provider?: string
  ): Promise<{ text: string; similarity: number; index: number }[]> {
    // A negative k must not reach slice(), where it would mean
    // "everything except the last |k| results".
    const limit = k > 0 ? Math.floor(k) : 0;

    const [queryEmbed, ...corpusEmbeds] = await this.embed([query, ...corpus], provider);

    const results = corpusEmbeds.map((embed, i) => ({
      text: corpus[i],
      similarity: this.cosineSimilarity(queryEmbed, embed),
      index: i,
    }));

    return results
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, limit);
  }

  /**
   * Get cache statistics.
   *
   * @returns `size` - current number of entries; `maxSize` - configured
   *   limit; `hitRate` - fraction (0-1) of cache lookups served from the
   *   cache since construction or the last `clearCache()`, 0 if there were
   *   no lookups
   */
  getCacheStats(): {
    size: number;
    maxSize: number;
    hitRate: number;
  } {
    const lookups = this.cacheHits + this.cacheMisses;

    return {
      size: this.cache.size,
      maxSize: this.config.maxCacheSize,
      hitRate: lookups > 0 ? this.cacheHits / lookups : 0,
    };
  }

  /**
   * Clear the cache and reset the hit/miss counters.
   */
  clearCache(): void {
    this.cache.clear();
    this.cacheHits = 0;
    this.cacheMisses = 0;
  }

  /**
   * Get embedding dimensions for a provider.
   *
   * @param provider - Provider name (uses default if not specified)
   * @throws Error if the provider is unknown
   */
  getDimensions(provider?: string): number {
    return this.getProvider(provider).getDimensions();
  }

  /**
   * List the names of all registered providers.
   */
  listProviders(): string[] {
    return Array.from(this.providers.keys());
  }
}
357
+
358
/**
 * Factory for a new, independent embedding service.
 *
 * @param config - Optional service configuration, passed straight to the constructor
 * @returns A freshly constructed service
 */
export function createEmbeddingService(
  config?: EmbeddingServiceConfig
): EmbeddingService {
  const service = new EmbeddingService(config);
  return service;
}
366
+
367
// Module-wide shared instance; stays null until first requested.
let defaultService: EmbeddingService | null = null;

/**
 * Return the shared embedding service, constructing it with default
 * settings on the first call and reusing it afterwards.
 */
export function getDefaultEmbeddingService(): EmbeddingService {
  return defaultService ?? (defaultService = new EmbeddingService());
}
379
+
380
// Aggregate object exposing the module's public API as the default export.
const embeddingServiceModule = {
  EmbeddingService,
  LocalNGramProvider,
  MockEmbeddingProvider,
  createEmbeddingService,
  getDefaultEmbeddingService,
};

export default embeddingServiceModule;
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Services module exports
3
+ */
4
+
5
+ export * from './embedding-service';
6
+ export { default as embeddingService } from './embedding-service';
package/src/types.ts ADDED
@@ -0,0 +1,161 @@
1
/**
 * A stored item: an identifier, its embedding vector and optional metadata.
 */
export interface VectorEntry {
  /** Unique identifier for the vector. */
  id: string;
  /** The embedding, as an array of numbers. */
  vector: number[];
  /** Optional free-form metadata attached to the vector. */
  metadata?: Record<string, any>;
}
12
+
13
/**
 * Parameters of a similarity search.
 */
export interface SearchQuery {
  /** Vector to search for. */
  vector: number[];
  /** How many results to return. */
  k?: number;
  /** Optional metadata filters. */
  filter?: Record<string, any>;
  /** Minimum similarity a result must reach (documented range: 0-1). */
  threshold?: number;
}
26
+
27
/**
 * One search hit: the matched vector together with its similarity score.
 */
export interface SearchResult {
  /** ID of the matched vector. */
  id: string;
  /**
   * Similarity score, documented as 0-1 with higher meaning more similar.
   * NOTE(review): confirm the 0-1 range also holds for the 'euclidean' and
   * 'dot' metrics.
   */
  score: number;
  /** The matched vector's data. */
  vector: number[];
  /** Metadata associated with the matched vector, if any. */
  metadata?: Record<string, any>;
}
40
+
41
/**
 * Options used to configure a vector database.
 */
export interface DbOptions {
  /** Size (number of components) of every stored vector. */
  dimension: number;
  /** Distance metric to use. */
  metric?:
    | 'cosine'
    | 'euclidean'
    | 'dot';
  /** Path at which the database is persisted. */
  path?: string;
  /** Turn on automatic persistence. */
  autoPersist?: boolean;
  /** Tuning parameters for the HNSW index. */
  hnsw?: {
    /** Maximum number of connections per layer. */
    m?: number;
    /** Size of the dynamic candidate list used during construction. */
    efConstruction?: number;
    /** Size of the dynamic candidate list used during search. */
    efSearch?: number;
  };
}
63
+
64
/**
 * Summary statistics describing a vector database.
 */
export interface DbStats {
  /** Total number of stored vectors. */
  count: number;
  /** Dimension of the stored vectors. */
  dimension: number;
  /** Name of the distance metric. */
  metric: string;
  /** Memory usage in bytes, when reported. */
  memoryUsage?: number;
  /** Type of index, when reported. */
  indexType?: string;
}
79
+
80
/**
 * Shape of the main VectorDB class.
 *
 * NOTE(review): the construct signature below is declared on the same
 * interface as the instance methods, so a value of this type must itself
 * be constructable and a class cannot satisfy it via `implements`. It
 * probably belongs on a separate constructor type - confirm before changing,
 * since callers may rely on the current declaration.
 */
export interface VectorDB {
  /**
   * Construct a new vector database.
   * @param options Database configuration
   */
  new(options: DbOptions): VectorDB;

  /**
   * Add one vector.
   * @param entry Vector entry to insert
   */
  insert(entry: VectorEntry): void;

  /**
   * Add several vectors in one batch.
   * @param entries Array of vector entries
   */
  insertBatch(entries: VectorEntry[]): void;

  /**
   * Look up vectors similar to the query.
   * @param query Search query parameters
   * @returns Array of search results
   */
  search(query: SearchQuery): SearchResult[];

  /**
   * Fetch a vector by its ID.
   * @param id Vector ID
   * @returns Vector entry or null
   */
  get(id: string): VectorEntry | null;

  /**
   * Remove a vector by its ID.
   * @param id Vector ID
   * @returns true if deleted, false if not found
   */
  delete(id: string): boolean;

  /**
   * Update the metadata of a vector.
   * @param id Vector ID
   * @param metadata New metadata
   */
  updateMetadata(id: string, metadata: Record<string, any>): void;

  /**
   * Report database statistics.
   */
  stats(): DbStats;

  /**
   * Write the database to disk.
   * @param path Optional path (uses configured path if not provided)
   */
  save(path?: string): void;

  /**
   * Read the database from disk.
   * @param path Path to database file
   */
  load(path: string): void;

  /**
   * Remove every vector from the database.
   */
  clear(): void;

  /**
   * Build the HNSW index for faster search.
   */
  buildIndex(): void;

  /**
   * Optimize the database (rebuild indices, compact storage).
   */
  optimize(): void;
}