@sparkleideas/embeddings 3.0.0-alpha.12-patch.14 → 3.0.0-alpha.12-patch.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,377 @@
1
+ /**
2
+ * RVF Embedding Service - Lightweight Hash-Based Embeddings
3
+ *
4
+ * Provides deterministic, sub-millisecond embedding generation using
5
+ * FNV-1a hash-based vectors. No neural model or external API required.
6
+ *
7
+ * Features:
8
+ * - Deterministic: same input always produces the same embedding
9
+ * - FNV-1a hash seeding with multi-round mixing
10
+ * - L2-normalized output vectors
11
+ * - Sub-millisecond generation (<0.1ms typical)
12
+ * - RvfEmbeddingCache for binary file persistence
13
+ * - Zero external dependencies
14
+ *
15
+ * Use cases:
16
+ * - Fast similarity search where relative distances matter more than semantics
17
+ * - Development and testing without API keys
18
+ * - Offline environments without neural model access
19
+ * - Bootstrapping before heavier providers are available
20
+ *
21
+ * @module @sparkleideas/embeddings
22
+ */
23
+
24
+ import { EventEmitter } from 'events';
25
+ import type {
26
+ EmbeddingProvider,
27
+ EmbeddingResult,
28
+ BatchEmbeddingResult,
29
+ IEmbeddingService,
30
+ EmbeddingEvent,
31
+ EmbeddingEventListener,
32
+ NormalizationType,
33
+ RvfEmbeddingConfig,
34
+ } from './types.js';
35
+ import { normalize } from './normalization.js';
36
+ import { RvfEmbeddingCache } from './rvf-embedding-cache.js';
37
+
38
+ // ============================================================================
39
+ // Constants
40
+ // ============================================================================
41
+
42
+ /** FNV-1a 32-bit offset basis: the initial hash state before any character of the text is mixed in. */
43
+ const FNV_OFFSET_BASIS = 0x811c9dc5;
44
+
45
+ /** FNV-1a 32-bit prime: the hash state is multiplied by this after each XOR step. */
46
+ const FNV_PRIME = 0x01000193;
47
+
48
+ /** Embedding vector length used when the config does not specify `dimensions`. */
49
+ const DEFAULT_DIMENSIONS = 384;
50
+
51
+ /** In-memory LRU capacity used when the config does not specify `cacheSize` (the persistent cache has its own fallback). */
52
+ const DEFAULT_CACHE_SIZE = 1000;
53
+
54
+ // ============================================================================
55
+ // LRU Cache (lightweight in-memory)
56
+ // ============================================================================
57
+
58
/**
 * Minimal least-recently-used cache built on Map insertion order.
 *
 * The first key in the Map is always the least recently used entry;
 * reads and writes re-insert the key so it moves to the end.
 * Hit/miss counters are kept for `hitRate` / `getStats()`.
 */
class LRUCache<K, V> {
  private readonly cache: Map<K, V> = new Map();
  private hits = 0;
  private misses = 0;

  /**
   * @param maxSize Maximum number of entries to retain. A capacity that is
   *   not greater than zero disables storage entirely.
   */
  constructor(private readonly maxSize: number) {}

  /**
   * Look up a key, marking it as most recently used on a hit.
   *
   * @returns The stored value, or `undefined` when the key is absent.
   */
  get(key: K): V | undefined {
    // Presence is decided with has() so that a legitimately stored
    // `undefined` value still counts as a hit and is refreshed.
    if (!this.cache.has(key)) {
      this.misses++;
      return undefined;
    }

    // has() just confirmed the key exists, so the value is a V.
    const value = this.cache.get(key) as V;

    // Re-insert to move the key to the end (most recently used).
    this.cache.delete(key);
    this.cache.set(key, value);
    this.hits++;
    return value;
  }

  /**
   * Insert or replace an entry, evicting the least recently used entry
   * when the cache is full.
   */
  set(key: K, value: V): void {
    // A zero/negative (or NaN) capacity means "cache nothing"; without this
    // guard the eviction branch finds nothing to evict and stores one entry.
    if (!(this.maxSize > 0)) {
      return;
    }

    if (this.cache.has(key)) {
      // Delete first so the re-insert below moves the key to the end.
      this.cache.delete(key);
    } else if (this.cache.size >= this.maxSize) {
      // Map iterates in insertion order, so the first key is the oldest.
      const oldest = this.cache.keys().next();
      if (!oldest.done) {
        this.cache.delete(oldest.value);
      }
    }
    this.cache.set(key, value);
  }

  /** Remove every entry and reset the hit/miss counters. */
  clear(): void {
    this.cache.clear();
    this.hits = 0;
    this.misses = 0;
  }

  /** Number of entries currently stored. */
  get size(): number {
    return this.cache.size;
  }

  /** Fraction of `get()` calls that were hits; 0 when no lookups have happened. */
  get hitRate(): number {
    const total = this.hits + this.misses;
    return total > 0 ? this.hits / total : 0;
  }

  /** Snapshot of the current size, capacity, and hit/miss counters. */
  getStats(): { size: number; maxSize: number; hits: number; misses: number; hitRate: number } {
    return {
      size: this.cache.size,
      maxSize: this.maxSize,
      hits: this.hits,
      misses: this.misses,
      hitRate: this.hitRate,
    };
  }
}
115
+
116
+ // ============================================================================
117
+ // RVF Embedding Service
118
+ // ============================================================================
119
+
120
/**
 * Lightweight hash-based embedding service.
 *
 * Generates deterministic embeddings from text using FNV-1a hashing
 * with per-dimension seed mixing and L2 normalization. With the default
 * normalization setting ('none') the output is a unit vector whose length
 * is the configured `dimensions` (default 384).
 *
 * Lookups go through an in-memory LRU cache first and, when `cachePath`
 * is configured, a persistent RvfEmbeddingCache second.
 *
 * Returned Float32Array instances are the same objects held by the
 * in-memory cache, so callers should treat them as read-only.
 *
 * Extends EventEmitter and implements IEmbeddingService for drop-in
 * compatibility with other providers.
 */
export class RvfEmbeddingService extends EventEmitter implements IEmbeddingService {
  /** 32-bit golden-ratio constant used to derive a distinct seed per dimension. */
  private static readonly SEED_MIX_CONSTANT = 0x9e3779b9;

  /** Multiplier applied to sin(seed) before taking the fractional part. */
  private static readonly SINE_SCALE = 43758.5453;

  readonly provider: EmbeddingProvider = 'rvf';

  private readonly dimensions: number;
  private readonly cache: LRUCache<string, Float32Array>;
  private readonly normalizationType: NormalizationType;
  private readonly embeddingListeners: Set<EmbeddingEventListener> = new Set();
  private persistentCache: RvfEmbeddingCache | null = null;

  /**
   * @param config Provider configuration. `dimensions` defaults to 384 and
   *   `cacheSize` to 1000 for the in-memory cache; `normalization` defaults
   *   to 'none'. A persistent cache is created only when `cachePath` is set.
   * @throws Error when `dimensions` is not a positive integer.
   */
  constructor(config: RvfEmbeddingConfig) {
    super();
    this.dimensions = config.dimensions ?? DEFAULT_DIMENSIONS;
    if (this.dimensions <= 0 || !Number.isInteger(this.dimensions)) {
      throw new Error(`Invalid dimensions: ${this.dimensions}. Must be a positive integer.`);
    }
    this.cache = new LRUCache(config.cacheSize ?? DEFAULT_CACHE_SIZE);
    this.normalizationType = config.normalization ?? 'none';

    // Initialize persistent RVF cache if a path is provided.
    // Note the persistent cache has a larger fallback capacity (10000)
    // than the in-memory cache when `cacheSize` is not given.
    if (config.cachePath) {
      this.persistentCache = new RvfEmbeddingCache({
        cachePath: config.cachePath,
        maxSize: config.cacheSize ?? 10000,
        dimensions: this.dimensions,
      });
    }
  }

  // --------------------------------------------------------------------------
  // IEmbeddingService Implementation
  // --------------------------------------------------------------------------

  /**
   * Generate an embedding for a single text string.
   *
   * Emits `cache_hit` when served from either cache; otherwise emits
   * `embed_start` and `embed_complete` around generation.
   *
   * @returns The embedding plus latency; cache hits report `latencyMs: 0`
   *   and `cached: true` (and `persistentCached: true` when the value came
   *   from the persistent cache).
   * @throws Error when `text` is not a string.
   */
  async embed(text: string): Promise<EmbeddingResult> {
    if (typeof text !== 'string') {
      throw new Error('embed() expects a string argument');
    }

    const hit = await this.lookupCached(text);
    if (hit) {
      return hit.fromPersistent
        ? { embedding: hit.embedding, latencyMs: 0, cached: true, persistentCached: true }
        : { embedding: hit.embedding, latencyMs: 0, cached: true };
    }

    this.emitEvent({ type: 'embed_start', text });
    const startTime = performance.now();

    const embedding = await this.computeAndStore(text);

    // Latency covers generation, normalization, and both cache writes.
    const latencyMs = performance.now() - startTime;
    this.emitEvent({ type: 'embed_complete', text, latencyMs });

    return { embedding, latencyMs };
  }

  /**
   * Generate embeddings for multiple text strings.
   *
   * Emits `batch_start` / `batch_complete` around the batch and `cache_hit`
   * for every item served from a cache. Results are in input order.
   *
   * @throws Error when `texts` is not an array or contains a non-string.
   */
  async embedBatch(texts: string[]): Promise<BatchEmbeddingResult> {
    if (!Array.isArray(texts)) {
      throw new Error('embedBatch() expects an array of strings');
    }
    // Validate every element up front, mirroring embed(). Without this a
    // non-string item either throws an opaque TypeError (null) or is
    // silently hashed as if it were empty text (numbers, undefined).
    for (const candidate of texts) {
      if (typeof candidate !== 'string') {
        throw new Error('embedBatch() expects an array of strings');
      }
    }

    this.emitEvent({ type: 'batch_start', count: texts.length });
    const startTime = performance.now();

    const embeddings: Float32Array[] = [];
    let cacheHits = 0;

    // Items are processed sequentially on purpose: a repeated text later in
    // the batch is then served from the cache populated by its first use.
    for (const text of texts) {
      const hit = await this.lookupCached(text);
      if (hit) {
        embeddings.push(hit.embedding);
        cacheHits++;
        continue;
      }
      embeddings.push(await this.computeAndStore(text));
    }

    const totalLatencyMs = performance.now() - startTime;
    this.emitEvent({ type: 'batch_complete', count: texts.length, latencyMs: totalLatencyMs });

    return {
      embeddings,
      totalLatencyMs,
      // Guard the empty batch: 0 / 0 would otherwise yield NaN.
      avgLatencyMs: texts.length > 0 ? totalLatencyMs / texts.length : 0,
      cacheStats: {
        hits: cacheHits,
        misses: texts.length - cacheHits,
      },
    };
  }

  /** Empty the in-memory cache and emit `cache_eviction` with the prior size. */
  clearCache(): void {
    const size = this.cache.size;
    this.cache.clear();
    this.emitEvent({ type: 'cache_eviction', size });
  }

  /** Current size, capacity, and hit rate of the in-memory cache. */
  getCacheStats(): { size: number; maxSize: number; hitRate: number } {
    const stats = this.cache.getStats();
    return {
      size: stats.size,
      maxSize: stats.maxSize,
      hitRate: stats.hitRate,
    };
  }

  /**
   * Clear the in-memory cache, drop registered embedding listeners, and
   * close the persistent cache if one was configured.
   */
  async shutdown(): Promise<void> {
    // clearCache() runs first so listeners still receive the eviction event.
    this.clearCache();
    this.embeddingListeners.clear();
    if (this.persistentCache) {
      await this.persistentCache.close();
    }
  }

  // --------------------------------------------------------------------------
  // Event System
  // --------------------------------------------------------------------------

  /** Register a listener that receives every embedding event. */
  addEventListener(listener: EmbeddingEventListener): void {
    this.embeddingListeners.add(listener);
  }

  /** Unregister a listener previously added with addEventListener(). */
  removeEventListener(listener: EmbeddingEventListener): void {
    this.embeddingListeners.delete(listener);
  }

  /**
   * Deliver an event to registered listeners, then re-emit it through
   * EventEmitter under `event.type`. A throwing listener is logged and
   * does not prevent delivery to the remaining listeners.
   */
  private emitEvent(event: EmbeddingEvent): void {
    for (const listener of this.embeddingListeners) {
      try {
        listener(event);
      } catch (error) {
        console.error('Error in embedding event listener:', error);
      }
    }
    this.emit(event.type, event);
  }

  // --------------------------------------------------------------------------
  // Cache Helpers
  // --------------------------------------------------------------------------

  /**
   * Look a text up in the in-memory cache, then the persistent cache.
   * Emits `cache_hit` on success; a persistent hit is also promoted into
   * the in-memory cache.
   *
   * @returns The cached embedding and its origin, or `undefined` on a miss.
   */
  private async lookupCached(
    text: string,
  ): Promise<{ embedding: Float32Array; fromPersistent: boolean } | undefined> {
    const inMemory = this.cache.get(text);
    if (inMemory) {
      this.emitEvent({ type: 'cache_hit', text });
      return { embedding: inMemory, fromPersistent: false };
    }

    if (this.persistentCache) {
      const persisted = await this.persistentCache.get(text);
      if (persisted) {
        this.cache.set(text, persisted);
        this.emitEvent({ type: 'cache_hit', text });
        return { embedding: persisted, fromPersistent: true };
      }
    }

    return undefined;
  }

  /**
   * Generate the embedding for a text, apply the configured normalization,
   * and write the result to the in-memory and (if present) persistent cache.
   */
  private async computeAndStore(text: string): Promise<Float32Array> {
    // Hash embeddings are already L2-normalized; applyNormalization only
    // changes them when the user configured a different normalization.
    const normalized = this.applyNormalization(this.generateHashEmbedding(text));

    this.cache.set(text, normalized);
    if (this.persistentCache) {
      await this.persistentCache.set(text, normalized);
    }
    return normalized;
  }

  // --------------------------------------------------------------------------
  // Hash Embedding Generation
  // --------------------------------------------------------------------------

  /**
   * Generate a deterministic embedding from text using FNV-1a hashing.
   *
   * Algorithm:
   *   1. Compute a base FNV-1a style hash over the text's UTF-16 code units.
   *   2. For each dimension, derive a seed by adding the dimension index
   *      multiplied by the golden-ratio constant to the base hash.
   *   3. Take the fractional part of a scaled sine of the seed and shift it
   *      to be centered on zero.
   *   4. L2-normalize the result to produce a unit vector.
   *
   * This is deterministic: the same text always yields the same vector.
   */
  private generateHashEmbedding(text: string): Float32Array {
    const embedding = new Float32Array(this.dimensions);

    // Base hash: XOR each UTF-16 code unit (not each byte) into the state,
    // then multiply by the FNV prime, keeping the state an unsigned 32-bit int.
    let baseHash = FNV_OFFSET_BASIS;
    for (let i = 0; i < text.length; i++) {
      baseHash ^= text.charCodeAt(i);
      baseHash = Math.imul(baseHash, FNV_PRIME) >>> 0;
    }

    // Generate each dimension from a mixed seed.
    for (let i = 0; i < this.dimensions; i++) {
      const seed = (baseHash + Math.imul(i, RvfEmbeddingService.SEED_MIX_CONSTANT)) >>> 0;
      // Scaled sine, then fractional part: a pseudo-random value in [0, 1).
      const x = Math.sin(seed) * RvfEmbeddingService.SINE_SCALE;
      embedding[i] = x - Math.floor(x);
      // Shift to roughly [-0.5, 0.5) so components are zero-centered.
      embedding[i] -= 0.5;
    }

    // L2 normalize to unit vector.
    let norm = 0;
    for (let i = 0; i < this.dimensions; i++) {
      norm += embedding[i] * embedding[i];
    }
    norm = Math.sqrt(norm);

    // A zero vector is left as-is to avoid dividing by zero.
    if (norm > 0) {
      for (let i = 0; i < this.dimensions; i++) {
        embedding[i] /= norm;
      }
    }

    return embedding;
  }

  /**
   * Apply user-configured normalization; returns the input unchanged when
   * the configured type is 'none'.
   */
  private applyNormalization(embedding: Float32Array): Float32Array {
    if (this.normalizationType === 'none') {
      return embedding;
    }
    return normalize(embedding, { type: this.normalizationType });
  }
}
package/src/types.ts CHANGED
@@ -19,7 +19,7 @@
19
19
  /**
20
20
  * Supported embedding providers
21
21
  */
22
- export type EmbeddingProvider = 'openai' | 'transformers' | 'mock' | 'agentic-flow';
22
+ export type EmbeddingProvider = 'openai' | 'transformers' | 'mock' | 'agentic-flow' | 'rvf';
23
23
 
24
24
  /**
25
25
  * Normalization type for embeddings
@@ -144,6 +144,23 @@ export interface AgenticFlowEmbeddingConfig extends EmbeddingBaseConfig {
144
144
  autoDownload?: boolean;
145
145
  }
146
146
 
147
+ /**
148
+ * RVF provider configuration.
149
+ * Selects the lightweight hash-based embedding provider (no neural model, sub-ms latency).
150
+ */
151
+ export interface RvfEmbeddingConfig extends EmbeddingBaseConfig {
152
+ provider: 'rvf';
153
+
154
+ /** Embedding dimensions (default: 384). Must be a positive integer; the service constructor throws otherwise. */
155
+ dimensions?: number;
156
+
157
+ /** Path to binary cache file for persistent storage. When omitted, the service creates no persistent cache. */
158
+ cachePath?: string;
159
+
160
+ /** Similarity metric preference (default: 'cosine'). NOTE(review): not read by the RvfEmbeddingService code in this change - confirm where it is consumed. */
161
+ metric?: 'cosine' | 'l2' | 'dotproduct';
162
+ }
163
+
147
164
  /**
148
165
  * Union of all provider configs
149
166
  */
@@ -151,7 +168,8 @@ export type EmbeddingConfig =
151
168
  | OpenAIEmbeddingConfig
152
169
  | TransformersEmbeddingConfig
153
170
  | MockEmbeddingConfig
154
- | AgenticFlowEmbeddingConfig;
171
+ | AgenticFlowEmbeddingConfig
172
+ | RvfEmbeddingConfig;
155
173
 
156
174
  // ============================================================================
157
175
  // Result Types