@sparkleideas/embeddings 3.0.0-alpha.12-patch.14 → 3.0.0-alpha.12-patch.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,578 @@
1
+ /**
2
+ * RVF Embedding Cache - Pure TypeScript Binary File Cache
3
+ *
4
+ * Replaces the sql.js-based PersistentEmbeddingCache with a lightweight
5
+ * pure-TS binary file format. No native dependencies required.
6
+ *
7
+ * Features:
8
+ * - Map-based in-memory cache with periodic flush to binary file
9
+ * - LRU eviction tracked via access timestamps
10
+ * - TTL support for cache entries
11
+ * - Deterministic FNV-1a text hashing for keys
12
+ * - Binary format: RVEC magic + uint32 format version + entry records
13
+ *
14
+ * Binary entry format:
15
+ * [4-byte key-hash][4-byte dims][dims*4 bytes float32][8-byte createdAt][8-byte accessedAt][8-byte access-count]
16
+ *
17
+ * @module @sparkleideas/embeddings
18
+ */
19
+
20
import { existsSync, mkdirSync, readFileSync, renameSync, unlinkSync, writeFileSync } from 'fs';
import { dirname } from 'path';
22
+
23
/**
 * Validate that a cache file path is safe to hand to the filesystem APIs.
 *
 * The only check performed is for embedded NUL characters.
 *
 * @param p - Candidate cache file path.
 * @throws Error if the path contains a null byte.
 */
function validatePath(p: string): void {
  if (p.includes('\0')) {
    throw new Error('Cache path contains null bytes');
  }
}
27
+
28
+ // ============================================================================
29
+ // Configuration
30
+ // ============================================================================
31
+
32
/**
 * Configuration for the RVF embedding cache.
 */
export interface RvfEmbeddingCacheConfig {
  /** Path to the binary cache file. */
  cachePath: string;
  /** Maximum number of entries kept before LRU eviction runs (default: 10000). */
  maxSize?: number;
  /** Time-to-live of an entry in milliseconds (default: 7 days). */
  ttlMs?: number;
  /** Expected embedding length; when set, embeddings of any other length are rejected. */
  dimensions?: number;
}
45
+
46
+ // ============================================================================
47
+ // Cache Entry
48
+ // ============================================================================
49
+
50
/** One cached embedding plus the bookkeeping used for TTL and LRU decisions. */
interface CacheEntry {
  /** The stored embedding vector. */
  embedding: Float32Array;
  /** Epoch milliseconds at which the entry was created; TTL is measured from here. */
  createdAt: number;
  /** Epoch milliseconds of the most recent access; eviction removes the oldest first. */
  accessedAt: number;
  /** Number of times the entry has been written or read. */
  accessCount: number;
}
56
+
57
+ // ============================================================================
58
+ // Constants
59
+ // ============================================================================
60
+
61
/** Binary file magic bytes: ASCII "RVEC". */
const MAGIC = new Uint8Array([0x52, 0x56, 0x45, 0x43]); // R V E C

/** Default TTL: 7 days in milliseconds. */
const DEFAULT_TTL_MS = 7 * 24 * 60 * 60 * 1000;

/** Default maximum number of entries. */
const DEFAULT_MAX_SIZE = 10000;

/** Auto-flush interval: 30 seconds, in milliseconds. */
const AUTO_FLUSH_INTERVAL_MS = 30000;

/** FNV-1a offset basis (32-bit). */
const FNV_OFFSET_BASIS = 0x811c9dc5;

/** FNV-1a prime (32-bit). */
const FNV_PRIME = 0x01000193;
78
+
79
+ // ============================================================================
80
+ // RVF Embedding Cache
81
+ // ============================================================================
82
+
83
/**
 * Pure-TS binary file embedding cache with LRU eviction and TTL support.
 *
 * Embeddings are held in an in-memory Map keyed by the 32-bit FNV-1a hash of
 * their text and are periodically flushed to a compact binary file.
 *
 * Caveat: entries are keyed by the 32-bit hash alone, so two different texts
 * that produce the same hash share a single entry.
 */
export class RvfEmbeddingCache {
  /** On-disk format version written by flushToFile(). */
  private static readonly FORMAT_VERSION = 2;

  private readonly cachePath: string;
  private readonly maxSize: number;
  private readonly ttlMs: number;
  private readonly dimensions: number | undefined;

  /** Cached embeddings keyed by text hash. */
  private entries: Map<number, CacheEntry> = new Map();
  /** Texts seen by set() in this process, mapped to their hash (not persisted). */
  private textToHash: Map<string, number> = new Map();
  /** True when in-memory state differs from what was last written to disk. */
  private dirty = false;
  private flushTimer: ReturnType<typeof setInterval> | null = null;
  private initialized = false;

  /**
   * @param config - Cache settings; only `cachePath` is required.
   * @throws Error if `cachePath` contains a null byte.
   */
  constructor(config: RvfEmbeddingCacheConfig) {
    this.cachePath = config.cachePath;
    this.maxSize = config.maxSize ?? DEFAULT_MAX_SIZE;
    this.ttlMs = config.ttlMs ?? DEFAULT_TTL_MS;
    this.dimensions = config.dimensions;
    validatePath(this.cachePath);
  }

  // --------------------------------------------------------------------------
  // Initialization
  // --------------------------------------------------------------------------

  /**
   * Lazily initialize the cache: create the parent directory, load the cache
   * file if it exists, purge expired entries, enforce capacity and start the
   * auto-flush timer. Subsequent calls are no-ops until close() is called.
   */
  private async ensureInitialized(): Promise<void> {
    if (this.initialized) return;

    const dir = dirname(this.cachePath);
    if (!existsSync(dir)) {
      mkdirSync(dir, { recursive: true });
    }

    if (existsSync(this.cachePath)) {
      this.loadFromFile();
    }

    this.cleanExpired();
    // A file written with a larger maxSize may hold more entries than allowed now.
    this.evictIfNeeded();
    this.startAutoFlush();

    this.initialized = true;
  }

  // --------------------------------------------------------------------------
  // Public API (matches PersistentEmbeddingCache)
  // --------------------------------------------------------------------------

  /**
   * Get an embedding from the cache by text.
   *
   * A hit refreshes the entry's last-access time and bumps its access count.
   * The returned array is the cache's own instance, not a copy.
   *
   * @param text - Text whose embedding is requested.
   * @returns The cached embedding, or null if absent or expired.
   */
  async get(text: string): Promise<Float32Array | null> {
    await this.ensureInitialized();

    const hash = this.hashText(text);
    const entry = this.entries.get(hash);
    if (!entry) {
      return null;
    }

    const now = Date.now();
    if (this.isExpired(entry, now)) {
      this.removeEntry(hash, text);
      return null;
    }

    entry.accessedAt = now;
    entry.accessCount++;
    this.dirty = true;

    return entry.embedding;
  }

  /**
   * Store an embedding in the cache, replacing any entry with the same hash.
   * Triggers LRU eviction if the cache exceeds maxSize.
   *
   * @param text - Text the embedding belongs to.
   * @param embedding - Vector to store; it is copied, so later mutation by the
   *   caller does not affect the cache.
   * @throws Error if `dimensions` is configured and the length differs.
   */
  async set(text: string, embedding: Float32Array): Promise<void> {
    await this.ensureInitialized();

    if (this.dimensions !== undefined && embedding.length !== this.dimensions) {
      throw new Error(
        `Dimension mismatch: expected ${this.dimensions}, got ${embedding.length}`
      );
    }

    const hash = this.hashText(text);
    const now = Date.now();
    const copy = new Float32Array(embedding.length);
    copy.set(embedding);

    const existing = this.entries.get(hash);
    if (existing) {
      existing.embedding = copy;
      // Fresh data starts a fresh TTL; otherwise overwriting a stale entry
      // would be discarded as expired by the very next get().
      existing.createdAt = now;
      existing.accessedAt = now;
      existing.accessCount++;
    } else {
      this.entries.set(hash, {
        embedding: copy,
        createdAt: now,
        accessedAt: now,
        accessCount: 1,
      });
      this.textToHash.set(text, hash);
    }

    this.dirty = true;
    this.evictIfNeeded();
  }

  /**
   * Check whether the cache contains a non-expired embedding for the text.
   * An expired entry is removed as a side effect.
   *
   * @param text - Text to look up.
   * @returns True if a live entry exists.
   */
  async has(text: string): Promise<boolean> {
    await this.ensureInitialized();

    const hash = this.hashText(text);
    const entry = this.entries.get(hash);
    if (!entry) return false;

    if (this.isExpired(entry, Date.now())) {
      this.removeEntry(hash, text);
      return false;
    }

    return true;
  }

  /**
   * Delete a specific entry from the cache.
   *
   * @param text - Text whose entry should be removed.
   * @returns True if the entry existed and was deleted.
   */
  async delete(text: string): Promise<boolean> {
    await this.ensureInitialized();

    const hash = this.hashText(text);
    const existed = this.entries.delete(hash);
    this.textToHash.delete(text);

    if (existed) {
      this.dirty = true;
    }

    return existed;
  }

  /**
   * Clear all entries from the cache and persist the empty state immediately.
   */
  async clear(): Promise<void> {
    await this.ensureInitialized();

    this.entries.clear();
    this.textToHash.clear();
    this.dirty = true;
    this.flushToFile();
  }

  /**
   * Return the number of entries currently in the cache.
   */
  async size(): Promise<number> {
    await this.ensureInitialized();
    return this.entries.size;
  }

  /**
   * Stop the auto-flush timer, flush pending changes to disk and release the
   * in-memory state. The cache re-initializes from disk on its next use.
   */
  async close(): Promise<void> {
    this.stopAutoFlush();

    if (this.dirty) {
      this.flushToFile();
    }

    this.entries.clear();
    this.textToHash.clear();
    this.initialized = false;
  }

  // --------------------------------------------------------------------------
  // Hashing
  // --------------------------------------------------------------------------

  /**
   * FNV-1a style 32-bit hash of the input text, folding in one UTF-16 code
   * unit per step. Deterministic: the same input always yields the same hash.
   *
   * @param text - Text to hash.
   * @returns Unsigned 32-bit hash value.
   */
  private hashText(text: string): number {
    let hash = FNV_OFFSET_BASIS;
    for (let i = 0; i < text.length; i++) {
      hash ^= text.charCodeAt(i);
      // Math.imul keeps the multiply in 32 bits; >>> 0 makes it unsigned.
      hash = Math.imul(hash, FNV_PRIME) >>> 0;
    }
    return hash;
  }

  // --------------------------------------------------------------------------
  // Entry removal helpers
  // --------------------------------------------------------------------------

  /** True when the entry's age, measured from createdAt, exceeds the TTL. */
  private isExpired(entry: CacheEntry, now: number): boolean {
    return now - entry.createdAt > this.ttlMs;
  }

  /** Remove a single entry whose text is known and mark the cache dirty. */
  private removeEntry(hash: number, text: string): void {
    this.entries.delete(hash);
    this.textToHash.delete(text);
    this.dirty = true;
  }

  /** Remove the given hashes and any text bookkeeping that points at them. */
  private dropHashes(hashes: readonly number[]): void {
    if (hashes.length === 0) return;

    const doomed = new Set(hashes);
    for (const [text, hash] of this.textToHash) {
      if (doomed.has(hash)) this.textToHash.delete(text);
    }
    for (const hash of hashes) {
      this.entries.delete(hash);
    }

    this.dirty = true;
  }

  // --------------------------------------------------------------------------
  // LRU Eviction
  // --------------------------------------------------------------------------

  /**
   * If the cache exceeds maxSize, evict the least-recently-accessed entries
   * until it is back at 90% capacity.
   */
  private evictIfNeeded(): void {
    if (this.entries.size <= this.maxSize) return;

    let targetSize = Math.floor(this.maxSize * 0.9);
    // For a tiny maxSize, 90% rounds down to 0, which would evict the entry
    // that was just inserted; always keep at least one when capacity allows.
    if (targetSize < 1 && this.maxSize >= 1) {
      targetSize = 1;
    }
    const toEvict = this.entries.size - targetSize;

    // Oldest access first; ties keep insertion order because sort is stable.
    const victims = [...this.entries.entries()]
      .sort((a, b) => a[1].accessedAt - b[1].accessedAt)
      .slice(0, toEvict)
      .map(([hash]) => hash);

    this.dropHashes(victims);
  }

  // --------------------------------------------------------------------------
  // TTL Cleanup
  // --------------------------------------------------------------------------

  /**
   * Remove all entries whose createdAt timestamp is older than the TTL.
   */
  private cleanExpired(): void {
    const now = Date.now();
    const expired: number[] = [];

    for (const [hash, entry] of this.entries) {
      if (this.isExpired(entry, now)) {
        expired.push(hash);
      }
    }

    this.dropHashes(expired);
  }

  // --------------------------------------------------------------------------
  // Auto-Flush Timer
  // --------------------------------------------------------------------------

  /** Start the periodic flush; does nothing if a timer is already running. */
  private startAutoFlush(): void {
    if (this.flushTimer) return;

    this.flushTimer = setInterval(() => {
      if (this.dirty) {
        this.flushToFile();
      }
    }, AUTO_FLUSH_INTERVAL_MS);
    // Do not let the flush timer alone keep the process alive.
    if (this.flushTimer.unref) this.flushTimer.unref();
  }

  /** Stop the periodic flush if it is running. */
  private stopAutoFlush(): void {
    if (this.flushTimer) {
      clearInterval(this.flushTimer);
      this.flushTimer = null;
    }
  }

  // --------------------------------------------------------------------------
  // Binary Serialization
  // --------------------------------------------------------------------------

  /**
   * Write all entries to the binary cache file (format version 2).
   *
   * Layout, all values little-endian:
   *   [4-byte magic "RVEC"]
   *   [4-byte format version (uint32)]
   *   For each entry:
   *     [4-byte key-hash (uint32)]
   *     [4-byte dims (uint32)]
   *     [dims * 4 bytes float32 data]
   *     [8-byte createdAt (float64)]
   *     [8-byte accessedAt (float64)]
   *     [8-byte accessCount (float64)]
   *
   * The data is written to a uniquely named temp file and renamed over the
   * cache file. Failures are logged, not thrown, and leave `dirty` set so a
   * later flush retries.
   */
  private flushToFile(): void {
    const tmpSuffix = Date.now().toString(36) + Math.random().toString(36).slice(2, 8);
    const tmpPath = this.cachePath + '.tmp.' + tmpSuffix;

    try {
      let totalSize = MAGIC.length + 4; // magic + version uint32
      for (const entry of this.entries.values()) {
        totalSize += 4 + 4 + entry.embedding.length * 4 + 8 + 8 + 8;
      }

      const buffer = new ArrayBuffer(totalSize);
      const view = new DataView(buffer);

      new Uint8Array(buffer).set(MAGIC, 0);
      let offset = MAGIC.length;

      view.setUint32(offset, RvfEmbeddingCache.FORMAT_VERSION, true);
      offset += 4;

      for (const [hash, entry] of this.entries) {
        view.setUint32(offset, hash, true);
        offset += 4;

        view.setUint32(offset, entry.embedding.length, true);
        offset += 4;

        // Written value by value so the file is little-endian on every host.
        for (const value of entry.embedding) {
          view.setFloat32(offset, value, true);
          offset += 4;
        }

        view.setFloat64(offset, entry.createdAt, true);
        offset += 8;

        view.setFloat64(offset, entry.accessedAt, true);
        offset += 8;

        view.setFloat64(offset, entry.accessCount, true);
        offset += 8;
      }

      const dir = dirname(this.cachePath);
      if (!existsSync(dir)) {
        mkdirSync(dir, { recursive: true });
      }

      writeFileSync(tmpPath, Buffer.from(buffer));
      renameSync(tmpPath, this.cachePath);
      this.dirty = false;
    } catch (error) {
      // Each flush uses a new temp name, so failed attempts would otherwise
      // pile up orphaned temp files next to the cache.
      try {
        if (existsSync(tmpPath)) unlinkSync(tmpPath);
      } catch {
        // Best effort only; the original error is reported below.
      }
      console.error(
        '[rvf-embedding-cache] Flush error:',
        error instanceof Error ? error.message : error
      );
    }
  }

  /**
   * Load entries from the binary cache file.
   *
   * Reads format version 2 as well as the older headerless version 1, in which
   * only accessedAt was stored and is reused as createdAt. Entries are skipped
   * when their length contradicts the configured `dimensions` or when their
   * bookkeeping values are not finite numbers. Loading stops at the first
   * truncated entry. Errors are logged, not thrown.
   */
  private loadFromFile(): void {
    try {
      const fileBuffer = readFileSync(this.cachePath);
      if (fileBuffer.length < MAGIC.length) return;

      const view = new DataView(
        fileBuffer.buffer,
        fileBuffer.byteOffset,
        fileBuffer.byteLength
      );
      const end = view.byteLength;

      for (let i = 0; i < MAGIC.length; i++) {
        if (fileBuffer[i] !== MAGIC[i]) {
          console.warn('[rvf-embedding-cache] Invalid magic bytes, skipping load');
          return;
        }
      }
      let offset = MAGIC.length;

      // Version 1 files have no version field; entries start right after the magic.
      let formatVersion = 1;
      if (offset + 4 <= end) {
        const possibleVersion = view.getUint32(offset, true);
        if (possibleVersion === RvfEmbeddingCache.FORMAT_VERSION) {
          formatVersion = possibleVersion;
          offset += 4;
        }
      }

      // v2 trailer: createdAt + accessedAt + accessCount; v1: accessedAt + accessCount.
      const trailerSize = formatVersion >= 2 ? 8 + 8 + 8 : 8 + 8;

      // Each entry needs at least 4 (hash) + 4 (dims) bytes of header.
      while (offset + 8 <= end) {
        const hash = view.getUint32(offset, true);
        offset += 4;

        const dims = view.getUint32(offset, true);
        offset += 4;

        const entryDataSize = dims * 4 + trailerSize;
        if (offset + entryDataSize > end) {
          console.warn('[rvf-embedding-cache] Truncated entry, stopping load');
          break;
        }

        // Vectors of the wrong length would be handed back by get(); skip them.
        if (this.dimensions !== undefined && dims !== this.dimensions) {
          offset += entryDataSize;
          continue;
        }

        const embedding = new Float32Array(dims);
        for (let i = 0; i < dims; i++) {
          embedding[i] = view.getFloat32(offset, true);
          offset += 4;
        }

        let createdAt: number;
        let accessedAt: number;
        let accessCount: number;

        if (formatVersion >= 2) {
          createdAt = view.getFloat64(offset, true);
          offset += 8;
          accessedAt = view.getFloat64(offset, true);
          offset += 8;
          accessCount = view.getFloat64(offset, true);
          offset += 8;
        } else {
          accessedAt = view.getFloat64(offset, true);
          offset += 8;
          accessCount = view.getFloat64(offset, true);
          offset += 8;
          createdAt = accessedAt;
        }

        // A NaN timestamp never compares as expired and breaks LRU ordering.
        if (
          !Number.isFinite(createdAt) ||
          !Number.isFinite(accessedAt) ||
          !Number.isFinite(accessCount)
        ) {
          continue;
        }

        this.entries.set(hash, {
          embedding,
          createdAt,
          accessedAt,
          accessCount,
        });
      }
    } catch (error) {
      console.warn(
        '[rvf-embedding-cache] Load error:',
        error instanceof Error ? error.message : error
      );
    }
  }
}