@terronex/aifbin-recall 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTING.md +65 -0
- package/LICENSE +21 -0
- package/NOTICE +36 -0
- package/README.md +250 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +182 -0
- package/dist/cli.js.map +1 -0
- package/dist/db.d.ts +29 -0
- package/dist/db.d.ts.map +1 -0
- package/dist/db.js +252 -0
- package/dist/db.js.map +1 -0
- package/dist/embedder.d.ts +47 -0
- package/dist/embedder.d.ts.map +1 -0
- package/dist/embedder.js +152 -0
- package/dist/embedder.js.map +1 -0
- package/dist/index.d.ts +27 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +45 -0
- package/dist/index.js.map +1 -0
- package/dist/indexer.d.ts +34 -0
- package/dist/indexer.d.ts.map +1 -0
- package/dist/indexer.js +246 -0
- package/dist/indexer.js.map +1 -0
- package/dist/mcp.d.ts +7 -0
- package/dist/mcp.d.ts.map +1 -0
- package/dist/mcp.js +207 -0
- package/dist/mcp.js.map +1 -0
- package/dist/search.d.ts +27 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +159 -0
- package/dist/search.js.map +1 -0
- package/dist/server.d.ts +13 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +250 -0
- package/dist/server.js.map +1 -0
- package/dist/types.d.ts +79 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +20 -0
- package/dist/types.js.map +1 -0
- package/package.json +64 -0
- package/src/cli.ts +195 -0
- package/src/db.ts +295 -0
- package/src/embedder.ts +175 -0
- package/src/index.ts +46 -0
- package/src/indexer.ts +272 -0
- package/src/mcp.ts +244 -0
- package/src/search.ts +201 -0
- package/src/server.ts +270 -0
- package/src/types.ts +103 -0
- package/tsconfig.json +20 -0
package/src/indexer.ts
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AIF-BIN file indexer for AIF-BIN Recall
|
|
3
|
+
* Parses AIF-BIN v2 binary format
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import fs from 'fs';
|
|
7
|
+
import path from 'path';
|
|
8
|
+
import { unpack } from 'msgpackr';
|
|
9
|
+
import type { AifBinFile, AifBinChunk, AifBinHeader, MemoryChunk, IndexOptions } from './types.js';
|
|
10
|
+
import { EngramDB } from './db.js';
|
|
11
|
+
|
|
12
|
+
// AIF-BIN v2 constants used by parseAifBinFile when validating and reading the header
|
|
13
|
+
const MAGIC = Buffer.from([0x41, 0x49, 0x46, 0x42, 0x49, 0x4e, 0x00, 0x01]); // "AIFBIN\x00\x01" - compared against the first 8 bytes of the file
|
|
14
|
+
const HEADER_SIZE = 64; // files shorter than this many bytes are rejected as too small
|
|
15
|
+
const ABSENT_OFFSET = BigInt('0xFFFFFFFFFFFFFFFF'); // header offset value that marks a section as not present
|
|
16
|
+
|
|
17
|
+
// Chunk types: numeric tags read from each chunk header (uint32) by parseAifBinFile
|
|
18
|
+
enum ChunkType {
|
|
19
|
+
TEXT = 1, // payload is decoded as UTF-8 text
|
|
20
|
+
TABLE_JSON = 2, // payload is parsed as JSON and re-serialized; raw UTF-8 is used if parsing fails
|
|
21
|
+
IMAGE = 3, // parseAifBinFile extracts no text for this type
|
|
22
|
+
AUDIO = 4, // parseAifBinFile extracts no text for this type
|
|
23
|
+
VIDEO = 5, // parseAifBinFile extracts no text for this type
|
|
24
|
+
CODE = 6, // payload is decoded as UTF-8 text, same as TEXT
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
/**
 * Parse an AIF-BIN v2 file from disk.
 *
 * The file is read synchronously in full. After the 8-byte magic is checked,
 * the fixed little-endian header is read: a uint32 version at byte 8, the
 * metadata section offset (uint64) at byte 16 and the content-chunks section
 * offset (uint64) at byte 32. An offset equal to `ABSENT_OFFSET` means the
 * section is missing.
 *
 * Section layouts, as read here:
 *   - metadata: uint64 length, then a MessagePack payload
 *   - content chunks: uint32 chunk count, then for each chunk
 *     uint32 type | uint64 dataLength | uint64 metadataLength | metadata | data
 *
 * Parsing is best-effort below the header level: unreadable metadata is
 * replaced by an empty object, and the first malformed chunk (including one
 * whose declared lengths run past the end of the file) stops chunk parsing
 * with a warning while the chunks read so far are kept.
 *
 * In the returned header, `chunkCount` is the count declared in the file,
 * `embeddingDim` is the length of the first non-empty chunk embedding, and
 * `createdAt` / `modifiedAt` come from the `created_at` / `modified_at`
 * metadata fields (0 when missing or unparsable). `flags` is always 0.
 *
 * @param filePath - Path of the AIF-BIN file to read.
 * @returns The parsed header, the successfully parsed chunks and the source path.
 * @throws Error if the file is shorter than the header or the magic bytes do not match.
 * @throws RangeError if a header section offset points outside the file
 *   (raised by the underlying DataView read).
 */
export function parseAifBinFile(filePath: string): AifBinFile {
  // uint32 type + uint64 dataLength + uint64 metadataLength
  const CHUNK_HEADER_SIZE = 20;

  const buffer = fs.readFileSync(filePath);

  if (buffer.length < HEADER_SIZE) {
    throw new Error(`File too small: ${filePath}`);
  }

  // Verify magic bytes
  const magic = buffer.subarray(0, 8);
  if (!magic.equals(MAGIC)) {
    throw new Error(`Invalid AIF-BIN file: bad magic bytes in ${filePath}`);
  }

  // Parse header (64 bytes); bytes 12-15 are padding
  const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
  const version = view.getUint32(8, true);
  const metadataOffset = view.getBigUint64(16, true);
  const contentChunksOffset = view.getBigUint64(32, true);

  const header: AifBinHeader = {
    magic: new Uint8Array(magic),
    version,
    flags: 0,
    chunkCount: 0,
    embeddingDim: 0,
    createdAt: 0,
    modifiedAt: 0,
  };

  // Parse metadata section
  let metadata: Record<string, unknown> = {};
  if (metadataOffset !== ABSENT_OFFSET) {
    const metaStart = Number(metadataOffset);
    const metaLength = Number(view.getBigUint64(metaStart, true));
    // subarray clamps to the buffer, so an oversized length simply yields a
    // short payload that fails to decode and leaves the metadata empty.
    metadata = unpackRecord(buffer.subarray(metaStart + 8, metaStart + 8 + metaLength));
  }

  // Parse content chunks section
  const chunks: AifBinChunk[] = [];
  if (contentChunksOffset !== ABSENT_OFFSET) {
    const chunksStart = Number(contentChunksOffset);
    const chunkCount = view.getUint32(chunksStart, true);
    header.chunkCount = chunkCount;

    let offset = chunksStart + 4;

    for (let i = 0; i < chunkCount; i++) {
      try {
        const chunkType = view.getUint32(offset, true);
        const dataLength = Number(view.getBigUint64(offset + 4, true));
        const metadataLength = Number(view.getBigUint64(offset + 12, true));

        const metaStart = offset + CHUNK_HEADER_SIZE;
        const dataStart = metaStart + metadataLength;
        const dataEnd = dataStart + dataLength;

        // Buffer.subarray would silently clamp, so a truncated chunk must be
        // rejected explicitly instead of being indexed with partial content.
        if (!Number.isSafeInteger(dataEnd) || dataEnd > buffer.length) {
          throw new RangeError(`chunk ${i} extends past the end of the file`);
        }

        // Parse chunk metadata (bad metadata is skipped, the chunk is kept)
        const chunkMeta: Record<string, unknown> =
          metadataLength > 0 ? unpackRecord(buffer.subarray(metaStart, dataStart)) : {};

        // Parse chunk data
        const chunkData = buffer.subarray(dataStart, dataEnd);
        offset = dataEnd;

        // Extract text content based on chunk type
        let text = '';
        if (chunkType === ChunkType.TEXT || chunkType === ChunkType.CODE) {
          text = chunkData.toString('utf-8');
        } else if (chunkType === ChunkType.TABLE_JSON) {
          const raw = chunkData.toString('utf-8');
          try {
            text = JSON.stringify(JSON.parse(raw));
          } catch {
            text = raw;
          }
        }

        // Extract embedding if present in chunk metadata
        const embedding = toEmbedding(chunkMeta.embedding);
        if (embedding.length > 0 && header.embeddingDim === 0) {
          header.embeddingDim = embedding.length;
        }

        chunks.push({
          id: (chunkMeta.id as string) || crypto.randomUUID(),
          text,
          embedding,
          metadata: chunkMeta,
        });
      } catch (e) {
        // Offsets of later chunks depend on this one, so stop at the first
        // malformed chunk and keep what was parsed so far.
        console.error(`  Warning: Failed to parse chunk ${i} in ${path.basename(filePath)}`);
        break;
      }
    }
  }

  // Extract timestamps from metadata if available
  header.createdAt = toEpochMs(metadata.created_at);
  header.modifiedAt = toEpochMs(metadata.modified_at);

  return {
    header,
    chunks,
    sourcePath: filePath,
  };
}

/**
 * Decode a MessagePack payload that is expected to hold a map.
 * Returns an empty object when decoding fails or the payload is not an object.
 */
function unpackRecord(bytes: Buffer): Record<string, unknown> {
  try {
    const decoded: unknown = unpack(bytes);
    if (typeof decoded === 'object' && decoded !== null && !Array.isArray(decoded)) {
      return decoded as Record<string, unknown>;
    }
    return {};
  } catch {
    return {};
  }
}

/**
 * Accept an embedding only when it is an array or a typed array; any other
 * value (missing, string, plain object, ...) is treated as "no embedding".
 */
function toEmbedding(value: unknown): number[] {
  if (Array.isArray(value)) {
    return value as number[];
  }
  if (ArrayBuffer.isView(value) && !(value instanceof DataView)) {
    // Typed arrays are passed through unchanged.
    return value as unknown as number[];
  }
  return [];
}

/**
 * Convert a metadata timestamp to epoch milliseconds.
 * Returns 0 when the value is missing or cannot be parsed as a date.
 */
function toEpochMs(value: unknown): number {
  if (!value) {
    return 0;
  }
  const ms = new Date(value as string).getTime();
  return Number.isNaN(ms) ? 0 : ms;
}
|
|
161
|
+
|
|
162
|
+
/**
 * Collect the paths of all `.aif-bin` files under a directory.
 *
 * Entries are visited in the order `fs.readdirSync` returns them. A
 * subdirectory is expanded in place (depth-first) when `recursive` is true
 * and ignored otherwise. Only regular files whose name ends with `.aif-bin`
 * are reported.
 *
 * @param dir - Directory to scan.
 * @param recursive - Whether to descend into subdirectories (default: true).
 * @returns Paths built by joining each directory with the matching entry name.
 */
export function findAifBinFiles(dir: string, recursive: boolean = true): string[] {
  const collect = (folder: string): string[] =>
    fs.readdirSync(folder, { withFileTypes: true }).flatMap((item) => {
      const candidate = path.join(folder, item.name);

      if (recursive && item.isDirectory()) {
        return collect(candidate);
      }

      const isMatch = item.isFile() && item.name.endsWith('.aif-bin');
      return isMatch ? [candidate] : [];
    });

  return collect(dir);
}
|
|
185
|
+
|
|
186
|
+
/**
 * Loads parsed AIF-BIN files into an EngramDB collection.
 */
export class Indexer {
  private db: EngramDB;

  constructor(db: EngramDB) {
    this.db = db;
  }

  /**
   * Index a single AIF-BIN file into a collection.
   *
   * Only chunks that carry a non-empty embedding are stored. When the file
   * has none, nothing is written (previously indexed chunks for the same
   * path are left as they are) and 0 is returned. Otherwise all chunks
   * previously stored for `filePath` are deleted before the new ones are
   * inserted, so re-indexing replaces rather than duplicates.
   *
   * @param filePath - Path of the AIF-BIN file; also used as the chunks' `sourceFile`.
   * @param collectionId - ID of the target collection.
   * @returns Number of chunks inserted.
   * @throws Whatever `parseAifBinFile` or the database calls throw.
   */
  indexFile(filePath: string, collectionId: string): number {
    const aifbin = parseAifBinFile(filePath);

    // Skip files with no chunks or no embeddings
    const chunksWithEmbeddings = aifbin.chunks.filter(c => c.embedding.length > 0);
    if (chunksWithEmbeddings.length === 0) {
      console.log(`  Skipped: ${path.basename(filePath)} (no embeddings)`);
      return 0;
    }

    // Delete existing chunks from this file (for re-indexing)
    this.db.deleteChunksBySource(filePath);

    // Convert to MemoryChunks and insert. chunkIndex is the position among
    // the chunks that have embeddings, not the position in the file.
    const chunks: Omit<MemoryChunk, 'createdAt' | 'updatedAt'>[] = chunksWithEmbeddings.map((chunk, index) => ({
      id: chunk.id || crypto.randomUUID(),
      collectionId,
      sourceFile: filePath,
      chunkIndex: index,
      text: chunk.text,
      embedding: chunk.embedding,
      metadata: {
        ...chunk.metadata,
        embeddingDim: aifbin.header.embeddingDim,
        originalCreatedAt: aifbin.header.createdAt,
        originalModifiedAt: aifbin.header.modifiedAt,
      },
    }));

    this.db.insertChunks(chunks);
    return chunks.length;
  }

  /**
   * Index every `.aif-bin` file found in a directory.
   *
   * The directory is scanned before the collection is looked up or created,
   * so a missing or unreadable directory fails without leaving an empty
   * collection behind. Files that fail to index are reported on stderr and
   * skipped; they do not abort the run.
   *
   * @param dir - Directory to scan.
   * @param options - Target collection name and whether to recurse (default: true).
   * @returns Number of files that contributed at least one chunk, and the total chunks inserted.
   * @throws Whatever `findAifBinFiles` throws when `dir` cannot be read.
   */
  indexDirectory(dir: string, options: IndexOptions): { files: number; chunks: number } {
    const { collection, recursive = true } = options;

    // Find all .aif-bin files first: this throws for an invalid directory
    // before any database state is created.
    const files = findAifBinFiles(dir, recursive);

    // Get or create collection
    let col = this.db.getCollection(collection);
    if (!col) {
      col = this.db.createCollection(collection);
    }

    let totalChunks = 0;
    let successFiles = 0;

    for (const file of files) {
      try {
        const count = this.indexFile(file, col.id);
        if (count > 0) {
          totalChunks += count;
          successFiles++;
          console.log(`  Indexed: ${path.basename(file)} (${count} chunks)`);
        }
      } catch (err) {
        console.error(`  Failed: ${path.basename(file)} - ${err}`);
      }
    }

    // Update collection stats
    this.db.updateCollectionStats(col.id);

    return { files: successFiles, chunks: totalChunks };
  }

  /**
   * Remove a file from the index.
   *
   * @param filePath - Source path whose chunks should be deleted.
   * @returns The value returned by the database delete call.
   */
  removeFile(filePath: string): number {
    return this.db.deleteChunksBySource(filePath);
  }
}
|
package/src/mcp.ts
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP (Model Context Protocol) server for AIF-BIN Recall
|
|
3
|
+
* Enables AI agents to query semantic memories
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
7
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
8
|
+
import {
|
|
9
|
+
CallToolRequestSchema,
|
|
10
|
+
ListToolsRequestSchema,
|
|
11
|
+
} from '@modelcontextprotocol/sdk/types.js';
|
|
12
|
+
import { EngramDB } from './db.js';
|
|
13
|
+
import { SearchEngine } from './search.js';
|
|
14
|
+
import { Indexer } from './indexer.js';
|
|
15
|
+
import { Embedder, type EmbeddingModelName } from './embedder.js';
|
|
16
|
+
|
|
17
|
+
/**
 * Start the MCP server on the stdio transport and register the recall tools.
 *
 * Tools exposed: `recall_search`, `recall_get`, `recall_collections` and
 * `recall_index`. Tool failures are returned to the client as a text result
 * with `isError: true` rather than thrown.
 *
 * Diagnostic output must go to stderr: with the stdio transport, stdout
 * carries the protocol messages.
 *
 * @param db - Database the tools read from and index into.
 * @returns A promise that resolves once the server is connected to the transport.
 */
export async function startMcpServer(db: EngramDB): Promise<void> {
  const search = new SearchEngine(db);
  const indexer = new Indexer(db);
  const embedder = new Embedder('minilm');

  const server = new Server(
    {
      name: 'aifbin-recall',
      version: '0.1.0',
    },
    {
      capabilities: {
        tools: {},
      },
    }
  );

  // List available tools
  server.setRequestHandler(ListToolsRequestSchema, async () => {
    return {
      tools: [
        {
          name: 'recall_search',
          description: 'Search semantic memories using natural language. Automatically embeds your query text. Returns relevant text chunks with similarity scores.',
          inputSchema: {
            type: 'object',
            properties: {
              query: {
                type: 'string',
                description: 'Natural language search query (will be embedded automatically)',
              },
              embedding: {
                type: 'array',
                items: { type: 'number' },
                description: 'Pre-computed query embedding vector (optional, query text is preferred)',
              },
              collection: {
                type: 'string',
                description: 'Collection name to search (optional, searches all if omitted)',
              },
              limit: {
                type: 'number',
                description: 'Maximum results to return (default: 10)',
              },
            },
            required: ['query'],
          },
        },
        {
          name: 'recall_get',
          description: 'Retrieve a specific memory chunk by ID',
          inputSchema: {
            type: 'object',
            properties: {
              id: {
                type: 'string',
                description: 'Chunk ID to retrieve',
              },
            },
            required: ['id'],
          },
        },
        {
          name: 'recall_collections',
          description: 'List all available memory collections',
          inputSchema: {
            type: 'object',
            properties: {},
          },
        },
        {
          name: 'recall_index',
          description: 'Index a directory of AIF-BIN files into a collection',
          inputSchema: {
            type: 'object',
            properties: {
              path: {
                type: 'string',
                description: 'Directory path containing .aif-bin files',
              },
              collection: {
                type: 'string',
                description: 'Collection name to index into',
              },
              recursive: {
                type: 'boolean',
                description: 'Search subdirectories (default: true)',
              },
            },
            required: ['path', 'collection'],
          },
        },
      ],
    };
  });

  // Handle tool calls
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const { name } = request.params;
    // `arguments` may be omitted by the client; treat that as "no arguments".
    const args: Record<string, unknown> = request.params.arguments ?? {};

    try {
      switch (name) {
        case 'recall_search': {
          const { embedding, query, collection, limit } = args as {
            embedding?: number[];
            query?: string;
            collection?: string;
            limit?: number;
          };

          const hasQuery = typeof query === 'string' && query.length > 0;
          // Only a non-empty array of finite numbers counts as an embedding.
          const hasEmbedding =
            Array.isArray(embedding) &&
            embedding.length > 0 &&
            embedding.every(v => typeof v === 'number' && Number.isFinite(v));

          if (!hasQuery && !hasEmbedding) {
            return {
              content: [{ type: 'text', text: 'Error: query text required' }],
              isError: true,
            };
          }

          // Fall back to the default for missing, non-numeric or non-positive limits.
          const maxResults =
            typeof limit === 'number' && Number.isFinite(limit) && limit >= 1
              ? Math.floor(limit)
              : 10;
          const options = { collection, limit: maxResults };

          // Use the supplied embedding when valid, otherwise embed the query text.
          const queryEmbedding: number[] = hasEmbedding
            ? (embedding as number[])
            : await embedder.embed(query as string);

          // Keyword matching needs query text; with an embedding only, run a
          // pure vector search instead of passing undefined text along.
          const results = hasQuery
            ? await search.hybridSearch(queryEmbedding, query as string, options)
            : await search.search(queryEmbedding, options);

          const formatted = results.map((r, i) =>
            `[${i + 1}] Score: ${r.score.toFixed(3)}\n` +
            `    Source: ${r.chunk.sourceFile}\n` +
            `    Text: ${r.chunk.text.slice(0, 500)}${r.chunk.text.length > 500 ? '...' : ''}\n` +
            `    ID: ${r.chunk.id}`
          ).join('\n\n');

          return {
            content: [{
              type: 'text',
              text: results.length > 0
                ? `Found ${results.length} results:\n\n${formatted}`
                : 'No results found.',
            }],
          };
        }

        case 'recall_get': {
          const { id } = args as { id?: string };

          if (typeof id !== 'string' || id.length === 0) {
            return {
              content: [{ type: 'text', text: 'Error: id required' }],
              isError: true,
            };
          }

          const chunk = search.recall(id);

          if (!chunk) {
            return {
              content: [{ type: 'text', text: `Chunk not found: ${id}` }],
              isError: true,
            };
          }

          return {
            content: [{
              type: 'text',
              text: `Source: ${chunk.sourceFile}\n` +
                `Chunk: ${chunk.chunkIndex}\n` +
                `Created: ${chunk.createdAt.toISOString()}\n\n` +
                `Text:\n${chunk.text}`,
            }],
          };
        }

        case 'recall_collections': {
          const collections = db.listCollections();

          if (collections.length === 0) {
            return {
              content: [{ type: 'text', text: 'No collections found. Use recall_index to create one.' }],
            };
          }

          const formatted = collections.map(c =>
            `• ${c.name}: ${c.chunkCount} chunks from ${c.fileCount} files` +
            (c.description ? ` - ${c.description}` : '')
          ).join('\n');

          return {
            content: [{
              type: 'text',
              text: `Available collections:\n\n${formatted}`,
            }],
          };
        }

        case 'recall_index': {
          const { path: dirPath, collection, recursive } = args as {
            path?: string;
            collection?: string;
            recursive?: boolean;
          };

          if (typeof dirPath !== 'string' || dirPath.length === 0 ||
              typeof collection !== 'string' || collection.length === 0) {
            return {
              content: [{ type: 'text', text: 'Error: path and collection required' }],
              isError: true,
            };
          }

          // The indexer reports progress through console.log, i.e. stdout,
          // which the stdio transport uses for protocol messages. Route those
          // lines to stderr for the duration of this synchronous call.
          const savedLog = console.log;
          console.log = (...parts: unknown[]): void => {
            console.error(...parts);
          };
          let result: { files: number; chunks: number };
          try {
            result = indexer.indexDirectory(dirPath, {
              collection,
              recursive: recursive !== false,
            });
          } finally {
            console.log = savedLog;
          }

          return {
            content: [{
              type: 'text',
              text: `Indexed ${result.files} files (${result.chunks} chunks) into collection "${collection}"`,
            }],
          };
        }

        default:
          return {
            content: [{ type: 'text', text: `Unknown tool: ${name}` }],
            isError: true,
          };
      }
    } catch (err) {
      return {
        content: [{ type: 'text', text: `Error: ${err}` }],
        isError: true,
      };
    }
  });

  // Start the server
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error('AIF-BIN Recall MCP server running');
}
|
package/src/search.ts
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Search functionality for AIF-BIN Recall
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { MemoryChunk, SearchResult, SearchOptions, SearchConfig } from './types.js';
|
|
6
|
+
import { DEFAULT_CONFIG } from './types.js';
|
|
7
|
+
import { EngramDB } from './db.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Cosine similarity of two equal-length vectors.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns The dot product divided by the product of the vector magnitudes,
 *   or 0 when either magnitude is 0 (including two empty vectors).
 * @throws Error if the vectors have different lengths.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  const dims = a.length;
  if (dims !== b.length) {
    throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`);
  }

  let dot = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;

  let idx = 0;
  while (idx < dims) {
    const x = a[idx];
    const y = b[idx];
    dot += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
    idx += 1;
  }

  const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
  return denominator === 0 ? 0 : dot / denominator;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Normalize BM25 scores to the 0-1 range, inverted so that the lowest raw
 * score maps to 1 and the highest to 0 (SQLite FTS5 reports better matches
 * as lower, negative values).
 *
 * When all scores are equal (including a single result) every entry maps to 1.
 * The minimum and maximum are found with a single pass rather than by
 * spreading the array into `Math.min` / `Math.max`, which throws a RangeError
 * once the argument list gets too long.
 *
 * @param scores - Raw keyword-search results.
 * @returns Map from result ID to normalized score; empty for empty input.
 */
function normalizeBM25Scores(scores: { id: string; score: number }[]): Map<string, number> {
  const normalized = new Map<string, number>();
  if (scores.length === 0) return normalized;

  let minScore = Infinity;
  let maxScore = -Infinity;
  for (const { score } of scores) {
    minScore = Math.min(minScore, score);
    maxScore = Math.max(maxScore, score);
  }

  // Avoid dividing by zero when every score is identical.
  const range = maxScore - minScore || 1;

  for (const { id, score } of scores) {
    // Invert and normalize: best match (lowest BM25) becomes highest score
    normalized.set(id, 1 - (score - minScore) / range);
  }

  return normalized;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Vector and hybrid (vector + keyword) search over the chunks stored in an EngramDB.
 */
export class SearchEngine {
  private db: EngramDB;
  private config: SearchConfig;

  /**
   * @param db - Database to search.
   * @param config - Overrides merged over `DEFAULT_CONFIG.search`.
   */
  constructor(db: EngramDB, config?: Partial<SearchConfig>) {
    this.db = db;
    this.config = { ...DEFAULT_CONFIG.search, ...config };
  }

  /**
   * Perform semantic search using a query embedding only.
   *
   * Stored chunks whose embedding length differs from the query's are
   * skipped; if no stored chunk is comparable, a dimension-mismatch error is
   * thrown so the caller learns why nothing could be scored.
   *
   * @param queryEmbedding - Query vector.
   * @param options - `collection` (name), `limit` (default: configured
   *   default limit) and `threshold` (minimum cosine similarity, default 0).
   * @returns Up to `limit` results sorted by descending similarity; `score`
   *   and `vectorScore` are both the cosine similarity.
   * @throws Error if the named collection does not exist, or if no stored
   *   chunk has the same embedding dimension as the query.
   */
  async search(
    queryEmbedding: number[],
    options: SearchOptions = {}
  ): Promise<SearchResult[]> {
    const {
      collection,
      limit = this.config.defaultLimit,
      threshold = 0.0,
    } = options;

    const collectionId = this.resolveCollectionId(collection);

    // Get all chunks with embeddings
    const chunks = this.db.getAllChunksWithEmbeddings(collectionId);
    if (chunks.length === 0) {
      return [];
    }

    // Calculate vector similarity scores
    const vectorScores: { chunk: MemoryChunk; score: number }[] = [];
    for (const chunk of this.selectComparable(chunks, queryEmbedding)) {
      const score = cosineSimilarity(queryEmbedding, chunk.embedding);
      if (score >= threshold) {
        vectorScores.push({ chunk, score });
      }
    }

    // Sort by vector score
    vectorScores.sort((a, b) => b.score - a.score);

    // Keyword scoring needs the query text, which this method does not
    // receive; hybrid ranking is done by hybridSearch.
    return vectorScores.slice(0, limit).map(({ chunk, score }) => ({
      chunk,
      score,
      vectorScore: score,
    }));
  }

  /**
   * Perform hybrid search combining vector similarity and keyword matching.
   *
   * The combined score is
   * `hybridWeight * vectorScore + (1 - hybridWeight) * keywordScore`, where
   * the keyword score is the normalized BM25 score of the chunk (0 when the
   * chunk is not among the keyword results). Chunks whose embedding length
   * differs from the query's get no vector score and are only returned when
   * they have a keyword hit.
   *
   * @param queryEmbedding - Query vector.
   * @param queryText - Query text passed to the database keyword search.
   * @param options - `collection`, `limit`, `threshold` (minimum combined
   *   score, default 0) and `hybridWeight` (default: configured weight).
   * @returns Up to `limit` results sorted by descending combined score.
   * @throws Error if the named collection does not exist, or if no stored
   *   chunk has the same embedding dimension as the query.
   */
  async hybridSearch(
    queryEmbedding: number[],
    queryText: string,
    options: SearchOptions = {}
  ): Promise<SearchResult[]> {
    const {
      collection,
      limit = this.config.defaultLimit,
      threshold = 0.0,
      hybridWeight = this.config.hybridWeight,
    } = options;

    const collectionId = this.resolveCollectionId(collection);

    // Get all chunks
    const chunks = this.db.getAllChunksWithEmbeddings(collectionId);
    if (chunks.length === 0) return [];

    // Calculate vector scores
    const vectorScoreMap = new Map<string, number>();
    for (const chunk of this.selectComparable(chunks, queryEmbedding)) {
      vectorScoreMap.set(chunk.id, cosineSimilarity(queryEmbedding, chunk.embedding));
    }

    // Get keyword scores (BM25); over-fetch so keyword hits outside the
    // vector top-N can still contribute.
    const keywordResults = this.db.keywordSearch(queryText, collectionId, limit * 3);
    const keywordScoreMap = normalizeBM25Scores(keywordResults);

    // Combine scores
    const results: SearchResult[] = [];
    const chunkMap = new Map(chunks.map(c => [c.id, c]));

    // Score all chunks that have either vector or keyword hits
    const allIds = new Set([...vectorScoreMap.keys(), ...keywordScoreMap.keys()]);

    for (const id of allIds) {
      const chunk = chunkMap.get(id);
      if (!chunk) continue;

      const vectorScore = vectorScoreMap.get(id) || 0;
      const keywordScore = keywordScoreMap.get(id) || 0;

      // Weighted combination
      const combinedScore = hybridWeight * vectorScore + (1 - hybridWeight) * keywordScore;

      if (combinedScore >= threshold) {
        results.push({
          chunk,
          score: combinedScore,
          vectorScore,
          keywordScore,
        });
      }
    }

    // Sort by combined score and limit
    results.sort((a, b) => b.score - a.score);
    return results.slice(0, limit);
  }

  /**
   * Recall a specific chunk by ID.
   *
   * @param id - Chunk ID.
   * @returns The value returned by the database lookup (`null` when absent, per the return type).
   */
  recall(id: string): MemoryChunk | null {
    return this.db.getChunk(id);
  }

  /**
   * Translate an optional collection name into its ID.
   *
   * @returns `undefined` when no name is given (search everything).
   * @throws Error if a name is given but no such collection exists.
   */
  private resolveCollectionId(collection?: string): string | undefined {
    if (!collection) {
      return undefined;
    }
    const col = this.db.getCollection(collection);
    if (!col) {
      throw new Error(`Collection not found: ${collection}`);
    }
    return col.id;
  }

  /**
   * Keep only the chunks whose embedding has the same length as the query.
   *
   * A store can hold embeddings of several dimensions; one mismatched chunk
   * must not abort the whole search. If nothing is comparable, the mismatch
   * is reported as an error instead of returning an empty result.
   */
  private selectComparable(chunks: MemoryChunk[], queryEmbedding: number[]): MemoryChunk[] {
    const comparable: MemoryChunk[] = [];
    let mismatchedDim: number | undefined;

    for (const chunk of chunks) {
      if (chunk.embedding.length === queryEmbedding.length) {
        comparable.push(chunk);
      } else if (mismatchedDim === undefined) {
        mismatchedDim = chunk.embedding.length;
      }
    }

    if (comparable.length === 0 && mismatchedDim !== undefined) {
      throw new Error(`Vector dimension mismatch: ${queryEmbedding.length} vs ${mismatchedDim}`);
    }
    return comparable;
  }
}
|