@imayuur/contexthub-vector-engine 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +63 -0
- package/dist/index.js +279 -0
- package/package.json +52 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import type { VectorSearchResult, MemoryEntry } from '@imayuur/contexthub-shared-types';
|
|
2
|
+
/**
 * How text is turned into a vector:
 * - `'local'`: dependency-free hashed unigram/bigram term-frequency vector.
 * - `'transformers'`: feature extraction through the optional
 *   `@xenova/transformers` package, loaded lazily on first use.
 * - `'off'`: no embedding is produced (an empty array is returned).
 */
export type EmbeddingMode = 'local' | 'off' | 'transformers';

/**
 * Stores embedding vectors keyed by memory id in a JSON index located at
 * `<repoPath>/.contexthub/embeddings/index.json` and ranks them by cosine
 * similarity.
 */
export declare class VectorEngine {
    /** Directory that holds the persisted `index.json`. */
    private embeddingsPath;
    /** In-memory id -> vector table mirrored to disk on every mutation. */
    private embeddings;
    /** Vector length used when building `'local'` embeddings. */
    private dimension;
    /** Embedding strategy chosen at construction time. */
    private mode;
    /** Feature-extraction pipeline shared by all instances once loaded. */
    private static extractor;

    /**
     * Creates an engine and immediately loads any previously saved index.
     *
     * @param repoPath Repository root; the index lives beneath `.contexthub/embeddings`.
     * @param mode Embedding strategy; `'local'` when omitted.
     */
    constructor(repoPath: string, mode?: EmbeddingMode);

    /** Reads the persisted index, falling back to an empty store on failure. */
    private loadEmbeddings;

    /** Writes the store to disk via a temporary file followed by a rename. */
    private saveEmbeddings;

    /** Loads the transformers pipeline on first use and reuses it afterwards. */
    private getExtractor;

    /**
     * Produces an embedding for `text` according to the configured mode.
     *
     * @param text Text to embed.
     * @returns The vector; an empty array when the mode is `'off'`.
     * @throws When the mode is `'transformers'` and the optional package cannot be loaded.
     */
    generateEmbedding(text: string): Promise<Array<number>>;

    /**
     * Stores a caller-supplied vector under `id` and persists the index.
     *
     * @param id Memory entry identifier.
     * @param embedding Vector to store.
     */
    addEmbedding(id: string, embedding: Array<number>): Promise<void>;

    /**
     * Embeds `content` and stores the result under `id`.
     *
     * @param id Memory entry identifier.
     * @param content Text to embed.
     */
    addEmbeddingForContent(id: string, content: string): Promise<void>;

    /**
     * Scores every stored vector against `embedding` using cosine similarity.
     *
     * @param embedding Query vector.
     * @param limit Maximum number of results; 10 when omitted.
     * @returns Matches ordered from highest to lowest score.
     */
    searchSimilar(embedding: Array<number>, limit?: number): Promise<Array<VectorSearchResult>>;

    /**
     * Embeds `query` and scores the supplied memories against it, generating
     * vectors from `content` for memories that have none stored yet.
     *
     * @param query Free-text query.
     * @param memories Candidate entries; each result carries its entry as metadata.
     * @param limit Maximum number of results; 10 when omitted.
     * @returns Matches ordered from highest to lowest score.
     */
    searchSimilarText(query: string, memories: Array<MemoryEntry>, limit?: number): Promise<Array<VectorSearchResult>>;

    /**
     * Replaces the vector stored under `id` and persists the index.
     *
     * @param id Memory entry identifier.
     * @param embedding Replacement vector.
     */
    updateEmbedding(id: string, embedding: Array<number>): Promise<void>;

    /**
     * Removes the vector stored under `id` and persists the index.
     *
     * @param id Memory entry identifier.
     */
    deleteEmbedding(id: string): Promise<void>;

    /** Cosine similarity of two vectors; 0 when lengths differ or either is all zeros. */
    private cosineSimilarity;

    /**
     * Embeds each item in order and persists the index once at the end.
     *
     * @param items Identifier/content pairs to embed.
     */
    batchGenerateEmbeddings(items: Array<{
        id: string;
        content: string;
    }>): Promise<void>;

    /** @returns Number of vectors currently held in the store. */
    getEmbeddingCount(): number;

    /** Empties the store and persists the empty index. */
    clearAll(): Promise<void>;
}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.VectorEngine = void 0;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
|
+
const crypto = __importStar(require("crypto"));
|
|
40
|
+
// ── Security Constants ────────────────────────────────────────────────────
|
|
41
|
+
const MAX_EMBEDDINGS = 10000;
|
|
42
|
+
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
|
|
43
|
+
class VectorEngine {
|
|
44
|
+
constructor(repoPath, mode = 'local') {
|
|
45
|
+
this.mode = mode;
|
|
46
|
+
this.dimension = mode === 'transformers' ? 384 : 1536;
|
|
47
|
+
this.embeddingsPath = path.join(repoPath, '.contexthub', 'embeddings');
|
|
48
|
+
this.embeddings = this.loadEmbeddings();
|
|
49
|
+
}
|
|
50
|
+
loadEmbeddings() {
|
|
51
|
+
try {
|
|
52
|
+
const indexPath = path.join(this.embeddingsPath, 'index.json');
|
|
53
|
+
if (fs.existsSync(indexPath)) {
|
|
54
|
+
// Security: Check file size before reading
|
|
55
|
+
const stats = fs.statSync(indexPath);
|
|
56
|
+
if (stats.size > MAX_FILE_SIZE) {
|
|
57
|
+
console.error('Embeddings file too large, starting fresh');
|
|
58
|
+
return {};
|
|
59
|
+
}
|
|
60
|
+
return JSON.parse(fs.readFileSync(indexPath, 'utf-8'));
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
catch (e) {
|
|
64
|
+
console.error('Failed to load embeddings:', e?.message || 'unknown error');
|
|
65
|
+
}
|
|
66
|
+
return {};
|
|
67
|
+
}
|
|
68
|
+
saveEmbeddings() {
|
|
69
|
+
// Security: Cap embedding count
|
|
70
|
+
const keys = Object.keys(this.embeddings);
|
|
71
|
+
if (keys.length > MAX_EMBEDDINGS) {
|
|
72
|
+
// Remove oldest entries (by key insertion order)
|
|
73
|
+
const toRemove = keys.slice(0, keys.length - MAX_EMBEDDINGS);
|
|
74
|
+
for (const key of toRemove) {
|
|
75
|
+
delete this.embeddings[key];
|
|
76
|
+
}
|
|
77
|
+
console.error(`Embedding store capped at ${MAX_EMBEDDINGS} entries`);
|
|
78
|
+
}
|
|
79
|
+
if (!fs.existsSync(this.embeddingsPath)) {
|
|
80
|
+
fs.mkdirSync(this.embeddingsPath, { recursive: true });
|
|
81
|
+
}
|
|
82
|
+
// Atomic write: write to tmp, then rename
|
|
83
|
+
const indexPath = path.join(this.embeddingsPath, 'index.json');
|
|
84
|
+
const tmpPath = indexPath + `.tmp.${crypto.randomBytes(4).toString('hex')}`;
|
|
85
|
+
try {
|
|
86
|
+
fs.writeFileSync(tmpPath, JSON.stringify(this.embeddings, null, 2), { mode: 0o600 });
|
|
87
|
+
fs.renameSync(tmpPath, indexPath);
|
|
88
|
+
}
|
|
89
|
+
catch (e) {
|
|
90
|
+
try {
|
|
91
|
+
fs.unlinkSync(tmpPath);
|
|
92
|
+
}
|
|
93
|
+
catch { /* ignore */ }
|
|
94
|
+
throw e;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Lazy load the transformers pipeline
|
|
99
|
+
*/
|
|
100
|
+
async getExtractor() {
|
|
101
|
+
if (!VectorEngine.extractor) {
|
|
102
|
+
try {
|
|
103
|
+
// Use dynamic import to avoid static dependency and bundle bloat
|
|
104
|
+
const transformers = await Function('return import("@xenova/transformers")')();
|
|
105
|
+
// Setup cache dir within .contexthub if we want, or rely on default
|
|
106
|
+
VectorEngine.extractor = await transformers.pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
|
|
107
|
+
}
|
|
108
|
+
catch (e) {
|
|
109
|
+
throw new Error('Failed to load @xenova/transformers. Install it with: npm install @xenova/transformers\n' + e.message);
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
return VectorEngine.extractor;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* Generate embedding from text using the selected mode
|
|
116
|
+
*/
|
|
117
|
+
async generateEmbedding(text) {
|
|
118
|
+
if (this.mode === 'off') {
|
|
119
|
+
return [];
|
|
120
|
+
}
|
|
121
|
+
if (this.mode === 'transformers') {
|
|
122
|
+
const extractor = await this.getExtractor();
|
|
123
|
+
const output = await extractor(text, { pooling: 'mean', normalize: true });
|
|
124
|
+
return Array.from(output.data);
|
|
125
|
+
}
|
|
126
|
+
// Local mode: Hash + Bigram TF weighting
|
|
127
|
+
const words = text.toLowerCase().split(/\W+/).filter(w => w.length > 2);
|
|
128
|
+
const termFreqs = new Map();
|
|
129
|
+
// Unigrams
|
|
130
|
+
for (const word of words) {
|
|
131
|
+
termFreqs.set(word, (termFreqs.get(word) || 0) + 1);
|
|
132
|
+
}
|
|
133
|
+
// Bigrams
|
|
134
|
+
for (let i = 0; i < words.length - 1; i++) {
|
|
135
|
+
const bigram = `${words[i]} ${words[i + 1]}`;
|
|
136
|
+
termFreqs.set(bigram, (termFreqs.get(bigram) || 0) + 1);
|
|
137
|
+
}
|
|
138
|
+
const embedding = new Array(this.dimension).fill(0);
|
|
139
|
+
for (const [term, freq] of termFreqs.entries()) {
|
|
140
|
+
let hash = 0;
|
|
141
|
+
for (let i = 0; i < term.length; i++) {
|
|
142
|
+
hash = ((hash << 5) - hash) + term.charCodeAt(i);
|
|
143
|
+
hash = hash & hash;
|
|
144
|
+
}
|
|
145
|
+
const seed = Math.abs(hash);
|
|
146
|
+
const isBigram = term.includes(' ');
|
|
147
|
+
// Spread across 3 positions based on seed
|
|
148
|
+
const pos1 = seed % this.dimension;
|
|
149
|
+
const pos2 = (seed * 17) % this.dimension;
|
|
150
|
+
const pos3 = (seed * 31) % this.dimension;
|
|
151
|
+
// TF weighting: 1 + log10(tf)
|
|
152
|
+
const tfWeight = 1 + Math.log10(freq);
|
|
153
|
+
// Bigrams receive a slight multiplier to emphasize phrase matches
|
|
154
|
+
const weight = tfWeight * (isBigram ? 1.5 : 1.0);
|
|
155
|
+
embedding[pos1] += weight;
|
|
156
|
+
embedding[pos2] += weight * 0.5;
|
|
157
|
+
embedding[pos3] += weight * 0.25;
|
|
158
|
+
}
|
|
159
|
+
// Normalize
|
|
160
|
+
const magnitude = Math.sqrt(embedding.reduce((sum, val) => sum + val * val, 0));
|
|
161
|
+
if (magnitude > 0) {
|
|
162
|
+
for (let i = 0; i < embedding.length; i++) {
|
|
163
|
+
embedding[i] /= magnitude;
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
return embedding;
|
|
167
|
+
}
|
|
168
|
+
/**
|
|
169
|
+
* Add embedding for a memory entry
|
|
170
|
+
*/
|
|
171
|
+
async addEmbedding(id, embedding) {
|
|
172
|
+
this.embeddings[id] = embedding;
|
|
173
|
+
this.saveEmbeddings();
|
|
174
|
+
console.log(`Added embedding for memory ${id} (${embedding.length} dimensions)`);
|
|
175
|
+
}
|
|
176
|
+
/**
|
|
177
|
+
* Add embedding by generating it from content
|
|
178
|
+
*/
|
|
179
|
+
async addEmbeddingForContent(id, content) {
|
|
180
|
+
const embedding = await this.generateEmbedding(content);
|
|
181
|
+
await this.addEmbedding(id, embedding);
|
|
182
|
+
}
|
|
183
|
+
/**
|
|
184
|
+
* Search for similar embeddings using cosine similarity
|
|
185
|
+
*/
|
|
186
|
+
async searchSimilar(embedding, limit = 10) {
|
|
187
|
+
const results = [];
|
|
188
|
+
for (const [id, storedEmbedding] of Object.entries(this.embeddings)) {
|
|
189
|
+
const score = this.cosineSimilarity(embedding, storedEmbedding);
|
|
190
|
+
results.push({ id, score, metadata: null });
|
|
191
|
+
}
|
|
192
|
+
// Sort by score descending and limit
|
|
193
|
+
results.sort((a, b) => b.score - a.score);
|
|
194
|
+
return results.slice(0, limit);
|
|
195
|
+
}
|
|
196
|
+
/**
|
|
197
|
+
* Search for similar text content
|
|
198
|
+
*/
|
|
199
|
+
async searchSimilarText(query, memories, limit = 10) {
|
|
200
|
+
const queryEmbedding = await this.generateEmbedding(query);
|
|
201
|
+
const results = [];
|
|
202
|
+
for (const memory of memories) {
|
|
203
|
+
let embedding = this.embeddings[memory.id];
|
|
204
|
+
if (!embedding && memory.content) {
|
|
205
|
+
embedding = await this.generateEmbedding(memory.content);
|
|
206
|
+
this.embeddings[memory.id] = embedding;
|
|
207
|
+
}
|
|
208
|
+
if (embedding) {
|
|
209
|
+
const score = this.cosineSimilarity(queryEmbedding, embedding);
|
|
210
|
+
results.push({ id: memory.id, score, metadata: memory });
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
this.saveEmbeddings();
|
|
214
|
+
results.sort((a, b) => b.score - a.score);
|
|
215
|
+
return results.slice(0, limit);
|
|
216
|
+
}
|
|
217
|
+
/**
|
|
218
|
+
* Update an existing embedding
|
|
219
|
+
*/
|
|
220
|
+
async updateEmbedding(id, embedding) {
|
|
221
|
+
this.embeddings[id] = embedding;
|
|
222
|
+
this.saveEmbeddings();
|
|
223
|
+
console.log(`Updated embedding for memory ${id}`);
|
|
224
|
+
}
|
|
225
|
+
/**
|
|
226
|
+
* Delete an embedding
|
|
227
|
+
*/
|
|
228
|
+
async deleteEmbedding(id) {
|
|
229
|
+
delete this.embeddings[id];
|
|
230
|
+
this.saveEmbeddings();
|
|
231
|
+
console.log(`Deleted embedding for memory ${id}`);
|
|
232
|
+
}
|
|
233
|
+
/**
|
|
234
|
+
* Calculate cosine similarity between two vectors
|
|
235
|
+
*/
|
|
236
|
+
cosineSimilarity(a, b) {
|
|
237
|
+
if (a.length !== b.length)
|
|
238
|
+
return 0;
|
|
239
|
+
let dotProduct = 0;
|
|
240
|
+
let magnitudeA = 0;
|
|
241
|
+
let magnitudeB = 0;
|
|
242
|
+
for (let i = 0; i < a.length; i++) {
|
|
243
|
+
dotProduct += a[i] * b[i];
|
|
244
|
+
magnitudeA += a[i] * a[i];
|
|
245
|
+
magnitudeB += b[i] * b[i];
|
|
246
|
+
}
|
|
247
|
+
const magnitudeProduct = Math.sqrt(magnitudeA) * Math.sqrt(magnitudeB);
|
|
248
|
+
if (magnitudeProduct === 0)
|
|
249
|
+
return 0;
|
|
250
|
+
return dotProduct / magnitudeProduct;
|
|
251
|
+
}
|
|
252
|
+
/**
|
|
253
|
+
* Batch generate embeddings for multiple texts
|
|
254
|
+
*/
|
|
255
|
+
async batchGenerateEmbeddings(items) {
|
|
256
|
+
for (const item of items) {
|
|
257
|
+
const embedding = await this.generateEmbedding(item.content);
|
|
258
|
+
this.embeddings[item.id] = embedding;
|
|
259
|
+
}
|
|
260
|
+
this.saveEmbeddings();
|
|
261
|
+
console.log(`Generated embeddings for ${items.length} items`);
|
|
262
|
+
}
|
|
263
|
+
/**
|
|
264
|
+
* Get embedding count
|
|
265
|
+
*/
|
|
266
|
+
getEmbeddingCount() {
|
|
267
|
+
return Object.keys(this.embeddings).length;
|
|
268
|
+
}
|
|
269
|
+
/**
|
|
270
|
+
* Clear all embeddings
|
|
271
|
+
*/
|
|
272
|
+
async clearAll() {
|
|
273
|
+
this.embeddings = {};
|
|
274
|
+
this.saveEmbeddings();
|
|
275
|
+
console.log('Cleared all embeddings');
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
exports.VectorEngine = VectorEngine;
|
|
279
|
+
VectorEngine.extractor = null;
|
package/package.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@imayuur/contexthub-vector-engine",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Embeddings and semantic search for ContextHub",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "git+https://github.com/iMayuuR/contexthub.git",
|
|
9
|
+
"directory": "packages/vector-engine"
|
|
10
|
+
},
|
|
11
|
+
"publishConfig": {
|
|
12
|
+
"access": "public"
|
|
13
|
+
},
|
|
14
|
+
"engines": {
|
|
15
|
+
"node": ">=18"
|
|
16
|
+
},
|
|
17
|
+
"main": "dist/index.js",
|
|
18
|
+
"types": "dist/index.d.ts",
|
|
19
|
+
"files": [
|
|
20
|
+
"dist"
|
|
21
|
+
],
|
|
22
|
+
"scripts": {
|
|
23
|
+
"build": "tsc",
|
|
24
|
+
"dev": "tsc --watch",
|
|
25
|
+
"prepublishOnly": "npm run build"
|
|
26
|
+
},
|
|
27
|
+
"dependencies": {
|
|
28
|
+
"@imayuur/contexthub-shared-types": "^1.0.0"
|
|
29
|
+
},
|
|
30
|
+
"devDependencies": {
|
|
31
|
+
"@types/node": "^18.0.0",
|
|
32
|
+
"typescript": "^5.0.0"
|
|
33
|
+
},
|
|
34
|
+
"author": "Mayur Dattatray Patil",
|
|
35
|
+
"bugs": {
|
|
36
|
+
"url": "https://github.com/iMayuuR/contexthub/issues"
|
|
37
|
+
},
|
|
38
|
+
"homepage": "https://github.com/iMayuuR/contexthub#readme",
|
|
39
|
+
"keywords": [
|
|
40
|
+
"contexthub",
|
|
41
|
+
"mcp",
|
|
42
|
+
"ai-memory",
|
|
43
|
+
"cursor",
|
|
44
|
+
"claude"
|
|
45
|
+
],
|
|
46
|
+
"exports": {
|
|
47
|
+
".": {
|
|
48
|
+
"types": "./dist/index.d.ts",
|
|
49
|
+
"default": "./dist/index.js"
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|