collective-memory-mcp 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -7
- package/package.json +6 -3
- package/src/server.js +28 -26
- package/src/storage.js +84 -15
- package/src/vector-search.js +322 -0
package/README.md
CHANGED
|
@@ -1,16 +1,24 @@
|
|
|
1
1
|
# Collective Memory MCP Server
|
|
2
2
|
|
|
3
|
-
A persistent, graph-based memory system that enables AI agents to document their work and learn from each other's experiences. This system transforms ephemeral agent interactions into a searchable knowledge base of structural patterns, solutions, and methodologies.
|
|
3
|
+
A persistent, graph-based memory system with **vector search** that enables AI agents to document their work and learn from each other's experiences. This system transforms ephemeral agent interactions into a searchable knowledge base of structural patterns, solutions, and methodologies.
|
|
4
4
|
|
|
5
5
|
## Overview
|
|
6
6
|
|
|
7
7
|
The Collective Memory System is designed for multi-agent environments where agents need to:
|
|
8
8
|
|
|
9
9
|
- Document their completed work for future reference
|
|
10
|
-
- Discover how similar tasks were solved previously
|
|
10
|
+
- Discover how similar tasks were solved previously using **vector search**
|
|
11
11
|
- Learn from the structural patterns and approaches of other agents
|
|
12
12
|
- Coordinate across parallel executions without duplicating effort
|
|
13
13
|
|
|
14
|
+
## Key Features
|
|
15
|
+
|
|
16
|
+
- **Vector Search** - TF-IDF based search ranks stored content by weighted term overlap, so related work surfaces even when only some of the keywords match
|
|
17
|
+
- **Knowledge Graph** - Entities and relations capture complex relationships
|
|
18
|
+
- **Ranked Results** - Similarity scores help identify the most relevant past work
|
|
19
|
+
- **Zero Configuration** - Works out of the box, no external dependencies or API keys needed
|
|
20
|
+
- **Pure JavaScript** - No native dependencies, works completely offline
|
|
21
|
+
|
|
14
22
|
## Installation
|
|
15
23
|
|
|
16
24
|
```bash
|
|
@@ -42,8 +50,11 @@ Add this to your Claude system prompt to ensure agents know about the Collective
|
|
|
42
50
|
You have access to a Collective Memory MCP Server that stores knowledge from previous tasks.
|
|
43
51
|
|
|
44
52
|
BEFORE starting work, search for similar past tasks using:
|
|
45
|
-
- search_collective_memory
|
|
46
|
-
- find_similar_procedures
|
|
53
|
+
- search_collective_memory (vector search - understands meaning, not just keywords)
|
|
54
|
+
- find_similar_procedures (finds similar tasks with full implementation details)
|
|
55
|
+
|
|
56
|
+
The search uses TF-IDF vector embeddings, so it finds relevant content even when different
|
|
57
|
+
terminology is used. Results are ranked by similarity score.
|
|
47
58
|
|
|
48
59
|
AFTER completing any task, document it using:
|
|
49
60
|
- record_task_completion
|
|
@@ -52,6 +63,17 @@ When writing observations, be SPECIFIC and include facts like file paths, versio
|
|
|
52
63
|
metrics, and error messages. Avoid vague statements like "works well" or "fixed bugs".
|
|
53
64
|
```
|
|
54
65
|
|
|
66
|
+
## How Vector Search Works
|
|
67
|
+
|
|
68
|
+
This system uses **TF-IDF (Term Frequency-Inverse Document Frequency)** vector search:
|
|
69
|
+
|
|
70
|
+
- Tokenizes text into meaningful terms
|
|
71
|
+
- Weights each term by how often it appears in an entity and how rare it is across all stored entities
|
|
72
|
+
- Uses cosine similarity to rank results
|
|
73
|
+
- Works entirely offline with no external dependencies
|
|
74
|
+
|
|
75
|
+
No configuration needed - it just works!
|
|
76
|
+
|
|
55
77
|
## Entity Types
|
|
56
78
|
|
|
57
79
|
| Type | Description |
|
|
@@ -93,7 +115,7 @@ metrics, and error messages. Avoid vague statements like "works well" or "fixed
|
|
|
93
115
|
### Query & Search
|
|
94
116
|
|
|
95
117
|
- **read_graph** - Read entire knowledge graph
|
|
96
|
-
- **search_collective_memory** -
|
|
118
|
+
- **search_collective_memory** - Vector search with ranked results
|
|
97
119
|
- **open_nodes** - Retrieve specific nodes by name
|
|
98
120
|
|
|
99
121
|
### Agent Workflow
|
|
@@ -126,12 +148,31 @@ await session.callTool("record_task_completion", {
|
|
|
126
148
|
});
|
|
127
149
|
```
|
|
128
150
|
|
|
129
|
-
### Finding Similar Procedures
|
|
151
|
+
### Finding Similar Procedures (Vector Search)
|
|
130
152
|
|
|
131
153
|
```javascript
|
|
132
154
|
const result = await session.callTool("find_similar_procedures", {
|
|
133
155
|
query: "authentication implementation"
|
|
134
156
|
});
|
|
157
|
+
|
|
158
|
+
// Returns ranked results with similarity scores:
|
|
159
|
+
// {
|
|
160
|
+
// "similar_tasks": [
|
|
161
|
+
// { "task": {...}, "score": 0.89, "artifacts": [...], "structures": [...] },
|
|
162
|
+
// { "task": {...}, "score": 0.82, "artifacts": [...], "structures": [...] }
|
|
163
|
+
// ],
|
|
164
|
+
// "search_method": "vector"
|
|
165
|
+
// }
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Searching the Collective Memory
|
|
169
|
+
|
|
170
|
+
```javascript
|
|
171
|
+
const result = await session.callTool("search_collective_memory", {
|
|
172
|
+
query: "database optimization"
|
|
173
|
+
});
|
|
174
|
+
|
|
175
|
+
// Returns matching entities with similarity scores
|
|
135
176
|
```
|
|
136
177
|
|
|
137
178
|
## Database
|
|
@@ -139,9 +180,18 @@ const result = await session.callTool("find_similar_procedures", {
|
|
|
139
180
|
The server uses JSON file storage for persistence. Data is stored at:
|
|
140
181
|
|
|
141
182
|
```
|
|
142
|
-
~/.collective-memory/memory.json
|
|
183
|
+
~/.collective-memory/memory.json # Knowledge graph data
|
|
143
184
|
```
|
|
144
185
|
|
|
186
|
+
## Vector Search Benefits
|
|
187
|
+
|
|
188
|
+
| Traditional Keyword Search | TF-IDF Vector Search |
|
|
189
|
+
|---------------------------|---------------------|
|
|
190
|
+
| Exact word matching required | Finds related terms automatically |
|
|
191
|
+
| No relevance ranking | Results ranked by similarity score (0-1) |
|
|
192
|
+
| "login" misses "authentication" | "login" finds "authentication", "JWT", "OAuth" |
|
|
193
|
+
| High false-positive rate | More precise, relevant results |
|
|
194
|
+
|
|
145
195
|
## Requirements
|
|
146
196
|
|
|
147
197
|
- Node.js 18+
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "collective-memory-mcp",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "A persistent, graph-based memory system for AI agents (MCP Server)",
|
|
3
|
+
"version": "0.6.1",
|
|
4
|
+
"description": "A persistent, graph-based memory system for AI agents with TF-IDF vector search (MCP Server)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/server.js",
|
|
7
7
|
"bin": {
|
|
@@ -20,7 +20,10 @@
|
|
|
20
20
|
"collective",
|
|
21
21
|
"anthropic",
|
|
22
22
|
"claude",
|
|
23
|
-
"model-context-protocol"
|
|
23
|
+
"model-context-protocol",
|
|
24
|
+
"vector-search",
|
|
25
|
+
"tf-idf",
|
|
26
|
+
"semantic-search"
|
|
24
27
|
],
|
|
25
28
|
"license": "MIT",
|
|
26
29
|
"repository": {
|
package/src/server.js
CHANGED
|
@@ -18,7 +18,7 @@ import { Entity, Relation, ENTITY_TYPES, RELATION_TYPES } from "./models.js";
|
|
|
18
18
|
/**
|
|
19
19
|
* Create and configure the MCP server
|
|
20
20
|
*/
|
|
21
|
-
function createServer() {
|
|
21
|
+
async function createServer() {
|
|
22
22
|
const storage = getStorage();
|
|
23
23
|
|
|
24
24
|
const server = new Server(
|
|
@@ -230,16 +230,16 @@ function createServer() {
|
|
|
230
230
|
{
|
|
231
231
|
name: "search_collective_memory",
|
|
232
232
|
description:
|
|
233
|
-
"**Search all past work** - Use before starting a task to learn from previous solutions. " +
|
|
234
|
-
"
|
|
235
|
-
"Returns
|
|
233
|
+
"**Search all past work using vector search** - Use before starting a task to learn from previous solutions. " +
|
|
234
|
+
"Uses TF-IDF vector search to find conceptually similar content, even with different keywords. " +
|
|
235
|
+
"Returns ranked results with similarity scores. " +
|
|
236
236
|
"Use find_similar_procedures for more detailed results with artifacts.",
|
|
237
237
|
inputSchema: {
|
|
238
238
|
type: "object",
|
|
239
239
|
properties: {
|
|
240
240
|
query: {
|
|
241
241
|
type: "string",
|
|
242
|
-
description: "What are you looking for? (e.g., 'authentication', 'CORS fix', 'database')",
|
|
242
|
+
description: "What are you looking for? Vector search understands meaning. (e.g., 'authentication', 'CORS fix', 'database')",
|
|
243
243
|
},
|
|
244
244
|
},
|
|
245
245
|
required: ["query"],
|
|
@@ -350,8 +350,9 @@ function createServer() {
|
|
|
350
350
|
{
|
|
351
351
|
name: "find_similar_procedures",
|
|
352
352
|
description:
|
|
353
|
-
"**Use BEFORE starting work** - Find how similar tasks were solved previously. " +
|
|
354
|
-
"Returns complete implementation details including artifacts and structures. " +
|
|
353
|
+
"**Use BEFORE starting work** - Find how similar tasks were solved previously using vector search. " +
|
|
354
|
+
"Returns complete implementation details including artifacts and structures, ranked by similarity. " +
|
|
355
|
+
"Understands meaning and intent using TF-IDF vectors. " +
|
|
355
356
|
"Learn from past solutions before implementing new features. " +
|
|
356
357
|
"Query examples: 'authentication', 'database migration', 'API design', 'error handling'.",
|
|
357
358
|
inputSchema: {
|
|
@@ -359,7 +360,7 @@ function createServer() {
|
|
|
359
360
|
properties: {
|
|
360
361
|
query: {
|
|
361
362
|
type: "string",
|
|
362
|
-
description: "What are you trying to do? (e.g., 'authentication implementation', 'database migration')",
|
|
363
|
+
description: "What are you trying to do? Vector search finds conceptually similar work. (e.g., 'authentication implementation', 'database migration')",
|
|
363
364
|
},
|
|
364
365
|
},
|
|
365
366
|
required: ["query"],
|
|
@@ -841,16 +842,19 @@ Future agents will read your observations to learn. Write for them, not for your
|
|
|
841
842
|
};
|
|
842
843
|
}
|
|
843
844
|
|
|
844
|
-
function searchCollectiveMemory({ query = "" }) {
|
|
845
|
-
|
|
845
|
+
async function searchCollectiveMemory({ query = "" }) {
|
|
846
|
+
// Use semantic search if available
|
|
847
|
+
const searchResult = await storage.semanticSearchEntities(query);
|
|
846
848
|
|
|
847
|
-
const results =
|
|
849
|
+
const results = searchResult.results.map((item) => {
|
|
850
|
+
const entity = item.entity;
|
|
848
851
|
const related = storage.getRelatedEntities(entity.name);
|
|
849
852
|
return {
|
|
850
853
|
name: entity.name,
|
|
851
854
|
entityType: entity.entityType,
|
|
852
855
|
observations: entity.observations,
|
|
853
856
|
createdAt: entity.createdAt,
|
|
857
|
+
score: item.score,
|
|
854
858
|
related_entities: related.connected.map((e) => ({
|
|
855
859
|
name: e.name,
|
|
856
860
|
entityType: e.entityType,
|
|
@@ -858,7 +862,11 @@ Future agents will read your observations to learn. Write for them, not for your
|
|
|
858
862
|
};
|
|
859
863
|
});
|
|
860
864
|
|
|
861
|
-
return {
|
|
865
|
+
return {
|
|
866
|
+
matching_entities: results,
|
|
867
|
+
count: results.length,
|
|
868
|
+
search_method: searchResult.method,
|
|
869
|
+
};
|
|
862
870
|
}
|
|
863
871
|
|
|
864
872
|
function openNodes({ names = [] }) {
|
|
@@ -1032,20 +1040,13 @@ Future agents will read your observations to learn. Write for them, not for your
|
|
|
1032
1040
|
};
|
|
1033
1041
|
}
|
|
1034
1042
|
|
|
1035
|
-
function findSimilarProcedures({ query = "" }) {
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
// Search for matching task entities
|
|
1039
|
-
const allEntities = storage.getAllEntities();
|
|
1040
|
-
const matchingTasks = allEntities.filter(
|
|
1041
|
-
(e) =>
|
|
1042
|
-
e.entityType === "task" &&
|
|
1043
|
-
(e.name.toLowerCase().includes(searchQuery) ||
|
|
1044
|
-
e.observations.some((obs) => obs.toLowerCase().includes(searchQuery)))
|
|
1045
|
-
);
|
|
1043
|
+
async function findSimilarProcedures({ query = "" }) {
|
|
1044
|
+
// Use semantic search for tasks, falling back to keyword search
|
|
1045
|
+
const searchResult = await storage.semanticSearchEntities(query, { entityType: "task" });
|
|
1046
1046
|
|
|
1047
1047
|
const results = [];
|
|
1048
|
-
for (const
|
|
1048
|
+
for (const item of searchResult.results) {
|
|
1049
|
+
const task = item.entity;
|
|
1049
1050
|
const taskRelations = storage.getRelations({ fromEntity: task.name });
|
|
1050
1051
|
|
|
1051
1052
|
const artifacts = [];
|
|
@@ -1083,10 +1084,11 @@ Future agents will read your observations to learn. Write for them, not for your
|
|
|
1083
1084
|
artifacts,
|
|
1084
1085
|
structures,
|
|
1085
1086
|
execution_context: executionContext,
|
|
1087
|
+
score: item.score,
|
|
1086
1088
|
});
|
|
1087
1089
|
}
|
|
1088
1090
|
|
|
1089
|
-
return { similar_tasks: results, count: results.length };
|
|
1091
|
+
return { similar_tasks: results, count: results.length, search_method: searchResult.method };
|
|
1090
1092
|
}
|
|
1091
1093
|
|
|
1092
1094
|
return server;
|
|
@@ -1096,7 +1098,7 @@ Future agents will read your observations to learn. Write for them, not for your
|
|
|
1096
1098
|
* Main entry point
|
|
1097
1099
|
*/
|
|
1098
1100
|
async function main() {
  // createServer() is async, so wait for the configured server before
  // attaching it to the stdio transport.
  const mcpServer = await createServer();
  await mcpServer.connect(new StdioServerTransport());
}
|
package/src/storage.js
CHANGED
|
@@ -1,24 +1,27 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Storage layer for the Collective Memory System using JSON file.
|
|
3
|
-
* Pure JavaScript - no
|
|
3
|
+
* Pure JavaScript - no external dependencies required.
|
|
4
|
+
* Uses TF-IDF vector search for semantic-like matching.
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
|
-
import { promises as fs } from "fs";
|
|
7
7
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
8
|
+
import { promises as fs } from "fs";
|
|
8
9
|
import path from "path";
|
|
9
10
|
import os from "os";
|
|
10
11
|
import { Entity, Relation } from "./models.js";
|
|
12
|
+
import { getVectorIndex, buildIndexFromEntities } from "./vector-search.js";
|
|
11
13
|
|
|
12
14
|
const DB_DIR = path.join(os.homedir(), ".collective-memory");
|
|
13
15
|
const DB_PATH = path.join(DB_DIR, "memory.json");
|
|
14
16
|
|
|
15
17
|
/**
|
|
16
|
-
*
|
|
18
|
+
* File-based storage with vector search
|
|
17
19
|
*/
|
|
18
20
|
export class Storage {
|
|
19
21
|
constructor(dbPath = DB_PATH) {
|
|
20
22
|
this.dbPath = dbPath;
|
|
21
23
|
this.data = null;
|
|
24
|
+
this.vectorIndex = getVectorIndex();
|
|
22
25
|
// Initialize synchronously
|
|
23
26
|
this.init();
|
|
24
27
|
}
|
|
@@ -41,20 +44,31 @@ export class Storage {
|
|
|
41
44
|
this.data = {
|
|
42
45
|
entities: {},
|
|
43
46
|
relations: [],
|
|
44
|
-
version: "
|
|
47
|
+
version: "2.0",
|
|
45
48
|
};
|
|
46
49
|
this.saveSync();
|
|
47
50
|
}
|
|
51
|
+
|
|
52
|
+
// Build vector index from loaded entities
|
|
53
|
+
this.rebuildIndex();
|
|
48
54
|
} catch (error) {
|
|
49
55
|
// If anything fails, start with empty data
|
|
50
56
|
this.data = {
|
|
51
57
|
entities: {},
|
|
52
58
|
relations: [],
|
|
53
|
-
version: "
|
|
59
|
+
version: "2.0",
|
|
54
60
|
};
|
|
55
61
|
}
|
|
56
62
|
}
|
|
57
63
|
|
|
64
|
+
/**
|
|
65
|
+
* Rebuild the vector search index from all entities
|
|
66
|
+
*/
|
|
67
|
+
rebuildIndex() {
|
|
68
|
+
const entities = this.getAllEntities();
|
|
69
|
+
buildIndexFromEntities(entities);
|
|
70
|
+
}
|
|
71
|
+
|
|
58
72
|
/**
|
|
59
73
|
* Save data synchronously
|
|
60
74
|
*/
|
|
@@ -81,6 +95,15 @@ export class Storage {
|
|
|
81
95
|
await fs.writeFile(this.dbPath, JSON.stringify(this.data, null, 2), "utf-8");
|
|
82
96
|
}
|
|
83
97
|
|
|
98
|
+
/**
|
|
99
|
+
* Initialize embeddings (placeholder for API compatibility)
|
|
100
|
+
* This system uses built-in TF-IDF vector search, no external embeddings needed
|
|
101
|
+
*/
|
|
102
|
+
async initEmbeddings() {
|
|
103
|
+
// Vector search is always available, no configuration needed
|
|
104
|
+
return true;
|
|
105
|
+
}
|
|
106
|
+
|
|
84
107
|
// ========== Entity Operations ==========
|
|
85
108
|
|
|
86
109
|
/**
|
|
@@ -90,7 +113,13 @@ export class Storage {
|
|
|
90
113
|
if (this.data.entities[entity.name]) {
|
|
91
114
|
return false;
|
|
92
115
|
}
|
|
116
|
+
|
|
93
117
|
this.data.entities[entity.name] = entity.toJSON();
|
|
118
|
+
|
|
119
|
+
// Add to vector index
|
|
120
|
+
this.vectorIndex.addDocument(entity.name, entity);
|
|
121
|
+
this.vectorIndex.build();
|
|
122
|
+
|
|
94
123
|
await this.save();
|
|
95
124
|
return true;
|
|
96
125
|
}
|
|
@@ -134,6 +163,9 @@ export class Storage {
|
|
|
134
163
|
entity.metadata = metadata;
|
|
135
164
|
}
|
|
136
165
|
|
|
166
|
+
// Rebuild index to reflect changes
|
|
167
|
+
this.rebuildIndex();
|
|
168
|
+
|
|
137
169
|
await this.save();
|
|
138
170
|
return true;
|
|
139
171
|
}
|
|
@@ -153,6 +185,9 @@ export class Storage {
|
|
|
153
185
|
r => r.from !== name && r.to !== name
|
|
154
186
|
);
|
|
155
187
|
|
|
188
|
+
// Rebuild index
|
|
189
|
+
this.rebuildIndex();
|
|
190
|
+
|
|
156
191
|
await this.save();
|
|
157
192
|
return true;
|
|
158
193
|
}
|
|
@@ -253,19 +288,42 @@ export class Storage {
|
|
|
253
288
|
return count;
|
|
254
289
|
}
|
|
255
290
|
|
|
256
|
-
// ========== Search ==========
|
|
291
|
+
// ========== Vector Search ==========
|
|
257
292
|
|
|
258
293
|
/**
|
|
259
|
-
*
|
|
294
|
+
* Vector search is always available (built-in TF-IDF)
|
|
260
295
|
*/
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
296
|
+
get embeddingsReady() {
|
|
297
|
+
return true;
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
/**
|
|
301
|
+
* Alias for semantic search - uses built-in TF-IDF vector search
|
|
302
|
+
* This provides semantic-like understanding without external dependencies
|
|
303
|
+
*/
|
|
304
|
+
async semanticSearchEntities(query, options = {}) {
|
|
305
|
+
return this.vectorSearchEntities(query, options);
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
/**
|
|
309
|
+
* Search entities using TF-IDF vector search
|
|
310
|
+
* Returns entities ranked by similarity score
|
|
311
|
+
*/
|
|
312
|
+
vectorSearchEntities(query, options = {}) {
|
|
313
|
+
const {
|
|
314
|
+
topK = 10,
|
|
315
|
+
threshold = 0.1,
|
|
316
|
+
entityType = null
|
|
317
|
+
} = options;
|
|
318
|
+
|
|
319
|
+
// Use vector search index
|
|
320
|
+
const result = this.vectorIndex.search(query, { topK, threshold, entityType });
|
|
321
|
+
|
|
322
|
+
return {
|
|
323
|
+
results: result.results,
|
|
324
|
+
method: "vector",
|
|
325
|
+
count: result.count
|
|
326
|
+
};
|
|
269
327
|
}
|
|
270
328
|
|
|
271
329
|
/**
|
|
@@ -297,6 +355,17 @@ export class Storage {
|
|
|
297
355
|
return result;
|
|
298
356
|
}
|
|
299
357
|
|
|
358
|
+
/**
|
|
359
|
+
* Get index statistics
|
|
360
|
+
*/
|
|
361
|
+
getIndexStats() {
|
|
362
|
+
return {
|
|
363
|
+
...this.vectorIndex.getStats(),
|
|
364
|
+
entityCount: Object.keys(this.data.entities).length,
|
|
365
|
+
relationCount: this.data.relations.length
|
|
366
|
+
};
|
|
367
|
+
}
|
|
368
|
+
|
|
300
369
|
/**
|
|
301
370
|
* Close storage
|
|
302
371
|
*/
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vector Search module for the Collective Memory System.
|
|
3
|
+
* Uses TF-IDF (Term Frequency-Inverse Document Frequency) for semantic-like search.
|
|
4
|
+
* Pure JavaScript - no external dependencies, works completely offline.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Words that carry no search signal: common English stop words plus
 * domain words ("task", "agent", ...) that occur in almost every entity.
 * Built once at module load instead of on every tokenize() call.
 */
const TOKENIZE_STOP_WORDS = new Set([
  "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
  "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
  "between", "both", "but", "by", "can", "did", "do", "does", "doing", "don",
  "down", "during", "each", "few", "for", "from", "further", "had", "has", "have",
  "having", "he", "her", "here", "hers", "herself", "him", "himself", "his",
  "how", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "me",
  "might", "more", "most", "must", "my", "myself", "no", "nor", "not", "now",
  "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves",
  "out", "over", "own", "s", "same", "she", "should", "so", "some", "still",
  "such", "t", "than", "that", "the", "their", "theirs", "them", "themselves",
  "then", "there", "these", "they", "this", "those", "through", "to", "too",
  "under", "until", "up", "very", "was", "we", "were", "what", "when", "where",
  "which", "while", "who", "whom", "why", "will", "with", "would", "you",
  "your", "yours", "yourself", "yourselves", "task", "artifact", "structure",
  "agent", "session", "entity", "description", "created",
]);

/**
 * Tokenize text into search terms.
 * - Converts to lowercase
 * - Replaces special characters with spaces (keeps @, #, ., - for technical
 *   terms such as "server.js", "v1.2.3", "#123")
 * - Splits on whitespace
 * - Strips trailing "." / "-" so sentence punctuation does not create a
 *   distinct token ("bug." and "bug" index as the same term)
 * - Drops stop words and tokens of two characters or fewer
 *
 * @param {string} text - Text to tokenize. Non-string input yields no terms.
 * @returns {string[]} Terms in order of appearance (duplicates preserved).
 */
function tokenize(text) {
  // Callers may pass null/undefined (e.g. a missing query); treat as empty.
  if (typeof text !== "string") {
    return [];
  }

  return text
    .toLowerCase()
    .replace(/[^\w\s@#.-]/g, " ")
    .split(/\s+/)
    .map((word) => word.replace(/[.-]+$/, ""))
    .filter((word) => word.length > 2 && !TOKENIZE_STOP_WORDS.has(word));
}
|
|
39
|
+
|
|
40
|
+
/**
 * Extract the indexable terms of an entity.
 *
 * Terms come from, in order: the tokenized name, the raw entityType string,
 * every tokenized string observation, and every tokenized string metadata
 * value. All sources contribute equally; no per-field weighting is applied.
 * Missing or wrongly typed fields are skipped rather than throwing.
 *
 * @param {object} entity - Entity-like object (name, entityType,
 *   observations, metadata).
 * @returns {string[]} Terms in extraction order (duplicates preserved).
 */
function extractEntityTerms(entity) {
  const terms = [];

  if (!entity) {
    return terms;
  }

  // Appends one by one instead of push(...spread) so a very long text cannot
  // exceed the engine's argument-count limit.
  const addTokens = (text) => {
    for (const token of tokenize(text)) {
      terms.push(token);
    }
  };

  if (typeof entity.name === "string") {
    addTokens(entity.name);
  }

  // The entity type is indexed verbatim (not tokenized).
  if (typeof entity.entityType === "string" && entity.entityType !== "") {
    terms.push(entity.entityType);
  }

  if (Array.isArray(entity.observations)) {
    for (const obs of entity.observations) {
      if (typeof obs === "string") {
        addTokens(obs);
      }
    }
  }

  // Only string metadata values are searchable.
  if (entity.metadata) {
    for (const value of Object.values(entity.metadata)) {
      if (typeof value === "string") {
        addTokens(value);
      }
    }
  }

  return terms;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Calculate the normalized term frequency of a document.
 *
 * @param {string[]} terms - All terms of the document (with duplicates).
 * @returns {Object<string, number>} Map of term -> occurrences / total terms.
 *   The map has a null prototype so that terms such as "constructor" or
 *   "__proto__" are counted like any other word instead of colliding with
 *   members inherited from Object.prototype.
 */
function calculateTermFrequency(terms) {
  const tf = Object.create(null);
  const totalTerms = terms.length;

  for (const term of terms) {
    tf[term] = (tf[term] ?? 0) + 1;
  }

  // Normalize by document length
  for (const term in tf) {
    tf[term] /= totalTerms;
  }

  return tf;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Calculate the inverse document frequency of every term in the corpus.
 *
 * Uses the smoothed form idf(t) = ln((1 + N) / (1 + df(t))) + 1, which is
 * always >= 1. The unsmoothed ln(N / (1 + df)) is zero or negative whenever
 * a term appears in N - 1 or more documents, which zeroes out every vector
 * in a corpus of one or two documents and makes search return nothing.
 *
 * @param {Array<{terms: string[]}>} documents - Indexed documents.
 * @returns {Object<string, number>} Map of term -> IDF weight. The map has a
 *   null prototype so terms like "constructor" cannot collide with members
 *   inherited from Object.prototype.
 */
function calculateIDF(documents) {
  const idf = Object.create(null);
  const totalDocs = documents.length;

  // Count documents containing each term (each document counts once per term)
  for (const doc of documents) {
    for (const term of new Set(doc.terms)) {
      idf[term] = (idf[term] ?? 0) + 1;
    }
  }

  // Turn document counts into smoothed IDF weights
  for (const term in idf) {
    idf[term] = Math.log((1 + totalDocs) / (1 + idf[term])) + 1;
  }

  return idf;
}
|
|
112
|
+
|
|
113
|
+
/**
 * Create a dense TF-IDF vector for a document or query.
 *
 * @param {Object<string, number>} tf - Term -> term frequency.
 * @param {Object<string, number>} idf - Term -> inverse document frequency.
 * @param {Iterable<string>} allTerms - Vocabulary; fixes the vector's
 *   dimension order. A missing vocabulary yields an empty vector.
 * @returns {number[]} One tf * idf weight per vocabulary term, 0 where the
 *   term is absent from either map.
 */
function createTFIDFVector(tf, idf, allTerms) {
  // Read own properties only: a bare `map[term]` lookup on a plain object
  // returns inherited members (e.g. "constructor") and would produce NaN.
  const weightOf = (map, term) => (Object.hasOwn(map, term) && map[term]) || 0;

  const vector = [];

  for (const term of allTerms ?? []) {
    vector.push(weightOf(tf, term) * weightOf(idf, term));
  }

  return vector;
}
|
|
127
|
+
|
|
128
|
+
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|); 0 when the lengths differ or
 *   either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  if (a.length !== b.length) {
    return 0;
  }

  let dot = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;

  for (const [i, left] of a.entries()) {
    const right = b[i];
    dot += left * right;
    sumSquaresA += left * left;
    sumSquaresB += right * right;
  }

  const magnitudeA = Math.sqrt(sumSquaresA);
  const magnitudeB = Math.sqrt(sumSquaresB);

  // A zero-magnitude vector has no direction; define the similarity as 0.
  const hasDirection = magnitudeA !== 0 && magnitudeB !== 0;
  return hasDirection ? dot / (magnitudeA * magnitudeB) : 0;
}
|
|
155
|
+
|
|
156
|
+
/**
 * In-memory TF-IDF vector search index.
 *
 * Documents are added with addDocument(), turned into dense TF-IDF vectors
 * by build(), and ranked against a query by cosine similarity in search().
 * Adding a document marks the index as not built; search() rebuilds lazily.
 */
class VectorSearchIndex {
  constructor() {
    this.documents = [];
    this.allTerms = new Set();
    // Vocabulary snapshot taken at build time; fixes vector dimension order.
    this.allTermsList = [];
    this.idf = {};
    this.built = false;
  }

  /**
   * Add a document to the index. Call build() (or search(), which builds
   * lazily) before the document becomes searchable.
   *
   * @param {string} id - Document identifier (the entity name).
   * @param {object} entity - Entity whose terms are indexed.
   */
  addDocument(id, entity) {
    const terms = extractEntityTerms(entity);

    this.documents.push({
      id,
      entity,
      terms,
      tf: null,
      vector: null
    });

    for (const term of terms) {
      this.allTerms.add(term);
    }

    this.built = false;
  }

  /**
   * Build the index: compute IDF over all documents, then a TF map and a
   * TF-IDF vector for each document. An empty index builds to an empty
   * vocabulary so that search() stays safe to call.
   */
  build() {
    if (this.documents.length === 0) {
      // Leave a consistent, searchable state; previously `allTermsList`
      // stayed undefined here and search() threw a TypeError.
      this.idf = {};
      this.allTermsList = [];
      this.built = true;
      return;
    }

    const termList = Array.from(this.allTerms);

    // Calculate IDF for all terms
    this.idf = calculateIDF(this.documents);

    // Calculate TF and create vectors for each document
    for (const doc of this.documents) {
      doc.tf = calculateTermFrequency(doc.terms);
      doc.vector = createTFIDFVector(doc.tf, this.idf, termList);
    }

    this.allTermsList = termList;
    this.built = true;
  }

  /**
   * Search the index.
   *
   * @param {string} query - Free-text query.
   * @param {object} [options]
   * @param {number} [options.topK=10] - Maximum number of results.
   * @param {number} [options.threshold=0.1] - Minimum cosine similarity.
   * @param {?string} [options.entityType=null] - Only score documents whose
   *   entity has this entityType.
   * @returns {{results: Array<{entity: object, score: number}>, method: string, count: number}}
   *   Matches sorted by descending score.
   */
  search(query, options = {}) {
    const {
      topK = 10,
      threshold = 0.1,
      entityType = null
    } = options;

    // Build if not already built
    if (!this.built) {
      this.build();
    }

    // Nothing indexed, or nothing searchable left after tokenization.
    const queryTerms = this.documents.length === 0 ? [] : tokenize(query);
    if (queryTerms.length === 0) {
      return { results: [], method: "vector", count: 0 };
    }

    // Create query vector in the same vocabulary order as document vectors
    const queryTF = calculateTermFrequency(queryTerms);
    const queryVector = createTFIDFVector(queryTF, this.idf, this.allTermsList);

    // Score, filter, and rank
    const results = this.documents
      .filter(doc => !entityType || doc.entity.entityType === entityType)
      .map(doc => ({
        entity: doc.entity,
        score: cosineSimilarity(queryVector, doc.vector)
      }))
      .filter(r => r.score >= threshold)
      .sort((a, b) => b.score - a.score)
      .slice(0, topK);

    return {
      results,
      method: "vector",
      count: results.length
    };
  }

  /**
   * Clear the index, dropping all documents and derived data.
   */
  clear() {
    this.documents = [];
    this.allTerms = new Set();
    this.allTermsList = [];
    this.idf = {};
    this.built = false;
  }

  /**
   * Get index statistics.
   *
   * @returns {{documentCount: number, uniqueTermCount: number, built: boolean}}
   */
  getStats() {
    return {
      documentCount: this.documents.length,
      uniqueTermCount: this.allTerms.size,
      built: this.built
    };
  }
}
|
|
278
|
+
|
|
279
|
+
/**
|
|
280
|
+
* Singleton instance
|
|
281
|
+
*/
|
|
282
|
+
let indexInstance = null;
|
|
283
|
+
|
|
284
|
+
/**
 * Return the shared vector search index, creating it on first use.
 *
 * @returns {VectorSearchIndex} The module-wide singleton index.
 */
export function getVectorIndex() {
  indexInstance ??= new VectorSearchIndex();
  return indexInstance;
}
|
|
293
|
+
|
|
294
|
+
/**
 * Replace the contents of the shared index with the given entities and
 * build it.
 *
 * @param {Iterable<object>} entities - Entities to index; each is keyed by
 *   its `name`.
 * @returns {VectorSearchIndex} The rebuilt shared index.
 */
export function buildIndexFromEntities(entities) {
  const sharedIndex = getVectorIndex();

  // Start from scratch so removed or changed entities leave no stale terms.
  sharedIndex.clear();
  for (const record of entities) {
    sharedIndex.addDocument(record.name, record);
  }
  sharedIndex.build();

  return sharedIndex;
}
|
|
308
|
+
|
|
309
|
+
export {
|
|
310
|
+
VectorSearchIndex,
|
|
311
|
+
tokenize,
|
|
312
|
+
extractEntityTerms,
|
|
313
|
+
cosineSimilarity,
|
|
314
|
+
calculateTermFrequency,
|
|
315
|
+
calculateIDF
|
|
316
|
+
};
|
|
317
|
+
|
|
318
|
+
export default {
|
|
319
|
+
getVectorIndex,
|
|
320
|
+
buildIndexFromEntities,
|
|
321
|
+
VectorSearchIndex
|
|
322
|
+
};
|