ocerebro 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cerebro/index/embeddings.db +0 -0
- package/cerebro/index/entities.db +0 -0
- package/cerebro/index/metadata.db +0 -0
- package/package.json +1 -1
- package/pyproject.toml +2 -1
- package/src/consolidation/promoter.py +71 -1
- package/src/index/entities_db.py +822 -0
- package/src/index/queries.py +81 -11
- package/src/mcp/server.py +146 -3
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/package.json
CHANGED
package/pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ocerebro"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.3.1"
|
|
8
8
|
description = "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -37,6 +37,7 @@ dependencies = [
|
|
|
37
37
|
"sentence-transformers>=2.2.0",
|
|
38
38
|
"mcp>=1.0.0",
|
|
39
39
|
"anthropic>=0.40.0",
|
|
40
|
+
"spacy>=3.5.0",
|
|
40
41
|
]
|
|
41
42
|
|
|
42
43
|
[project.optional-dependencies]
|
|
@@ -37,7 +37,8 @@ class Promoter:
|
|
|
37
37
|
def __init__(
|
|
38
38
|
self,
|
|
39
39
|
working_storage: YAMLStorage,
|
|
40
|
-
official_storage: MarkdownStorage
|
|
40
|
+
official_storage: MarkdownStorage,
|
|
41
|
+
entities_db_path: Optional[Path] = None
|
|
41
42
|
):
|
|
42
43
|
"""
|
|
43
44
|
Inicializa o Promoter.
|
|
@@ -45,9 +46,12 @@ class Promoter:
|
|
|
45
46
|
Args:
|
|
46
47
|
working_storage: Instância do YAMLStorage
|
|
47
48
|
official_storage: Instância do MarkdownStorage
|
|
49
|
+
entities_db_path: Path para o EntitiesDB (opcional)
|
|
48
50
|
"""
|
|
49
51
|
self.working_storage = working_storage
|
|
50
52
|
self.official_storage = official_storage
|
|
53
|
+
self.entities_db_path = entities_db_path
|
|
54
|
+
self._entities_db = None
|
|
51
55
|
|
|
52
56
|
def promote_session(
|
|
53
57
|
self,
|
|
@@ -223,6 +227,10 @@ class Promoter:
|
|
|
223
227
|
content=content
|
|
224
228
|
)
|
|
225
229
|
|
|
230
|
+
# Extrai entidades do frontmatter E do conteúdo
|
|
231
|
+
self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
|
|
232
|
+
self._extract_entities_from_content(draft_id, content)
|
|
233
|
+
|
|
226
234
|
return PromotionResult(
|
|
227
235
|
success=True,
|
|
228
236
|
source_type=draft.get("type", "session"),
|
|
@@ -300,6 +308,10 @@ class Promoter:
|
|
|
300
308
|
content=body
|
|
301
309
|
)
|
|
302
310
|
|
|
311
|
+
# Extrai entidades do frontmatter E do conteúdo
|
|
312
|
+
self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
|
|
313
|
+
self._extract_entities_from_content(draft_id, body)
|
|
314
|
+
|
|
303
315
|
return PromotionResult(
|
|
304
316
|
success=True,
|
|
305
317
|
source_type=draft.get("type", "session"),
|
|
@@ -433,3 +445,61 @@ class Promoter:
|
|
|
433
445
|
if existing:
|
|
434
446
|
existing.update(draft)
|
|
435
447
|
self.working_storage.write_feature(project, draft_id, existing)
|
|
448
|
+
|
|
449
|
+
@property
def entities_db(self):
    """Lazily construct and cache the EntitiesDB when a path was configured.

    Returns None when no ``entities_db_path`` was provided.
    """
    # Import deferred so the dependency is only paid when the graph is used.
    if self._entities_db is None and self.entities_db_path:
        from src.index.entities_db import EntitiesDB
        self._entities_db = EntitiesDB(self.entities_db_path)
    return self._entities_db
|
|
456
|
+
|
|
457
|
+
def _extract_entities_from_frontmatter(
    self,
    memory_id: str,
    frontmatter: Dict[str, Any],
    project: str
) -> List[str]:
    """Extract entities from a memory's frontmatter and register them in the graph.

    Args:
        memory_id: Memory identifier.
        frontmatter: Metadata dictionary.
        project: Project name.

    Returns:
        IDs of the entities created, or an empty list when no EntitiesDB
        is configured.
    """
    db = self.entities_db
    if not db:
        return []
    return db.extract_from_frontmatter(
        memory_id=memory_id,
        frontmatter=frontmatter,
        project=project
    )
|
|
482
|
+
|
|
483
|
+
def _extract_entities_from_content(
    self,
    memory_id: str,
    content: str
) -> List[str]:
    """Extract entities from free text via spaCy NER and register them in the graph.

    Args:
        memory_id: Memory identifier.
        content: Text content to analyze.

    Returns:
        IDs of the entities created, or an empty list when no EntitiesDB
        is configured.
    """
    db = self.entities_db
    if not db:
        return []
    return db.extract_from_content(
        memory_id=memory_id,
        content=content,
        use_spacy=True
    )
|
|
@@ -0,0 +1,822 @@
|
|
|
1
|
+
"""EntitiesDB: Grafo de experiência com entidades e relacionamentos"""
|
|
2
|
+
|
|
3
|
+
import sqlite3
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Set
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from collections import deque
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EntitiesDB:
    """SQLite-backed experience graph of entities and relationships.

    Stores entities extracted from memories (ORG, PERSON, TECH, ...) and the
    relationships between them, enabling associative search via graph
    traversal.

    Compared to LightRAG-style pipelines:
    - local extraction with spaCy NER (offline, free)
    - frontmatter fields become initial graph nodes (no LLM required)
    - implicit edges derived from project/tags/type
    """

    def __init__(self, db_path: Path):
        """Initialize the database, creating parent directories and the schema.

        Args:
            db_path: Path to the SQLite database file.
        """
        self.db_path = db_path
        db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_schema()

    def _connect(self) -> sqlite3.Connection:
        """Open a connection with dict-like rows and WAL journaling."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        return conn

    def _init_schema(self):
        """Create tables and indexes (idempotent)."""
        conn = self._connect()

        # Entities: one row per (memory, entity) occurrence.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS entities (
                id TEXT PRIMARY KEY,
                memory_id TEXT,
                entity_name TEXT,
                entity_type TEXT,
                confidence REAL DEFAULT 1.0,
                span_start INTEGER,
                span_end INTEGER,
                context_snippet TEXT,
                created_at TEXT DEFAULT (datetime('now')),
                FOREIGN KEY (memory_id) REFERENCES memories(id)
            )
        """)

        # Lookup indexes for name/type/memory queries.
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_entities_name
            ON entities(entity_name)
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_entities_type
            ON entities(entity_type)
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_entities_memory
            ON entities(memory_id)
        """)

        # Content-hash cache: lets us skip re-extraction when content is
        # unchanged between promotions.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS entity_cache (
                memory_id TEXT PRIMARY KEY,
                content_hash TEXT,
                processed_at TEXT DEFAULT (datetime('now')),
                entity_count INTEGER DEFAULT 0
            )
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_entity_cache_hash
            ON entity_cache(content_hash)
        """)

        # Relationships (graph edges) between entity names.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS entity_relationships (
                id TEXT PRIMARY KEY,
                source_entity TEXT,
                target_entity TEXT,
                relationship_type TEXT,
                memory_id TEXT,
                created_at TEXT DEFAULT (datetime('now')),
                FOREIGN KEY (memory_id) REFERENCES memories(id)
            )
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_relationships_source
            ON entity_relationships(source_entity)
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_relationships_target
            ON entity_relationships(target_entity)
        """)

        conn.commit()
        conn.close()

    # ========================================================================
    # ENTITY OPERATIONS
    # ========================================================================

    def insert_entity(
        self,
        memory_id: str,
        entity_name: str,
        entity_type: str,
        confidence: float = 1.0,
        span_start: int = 0,
        span_end: int = 0,
        context_snippet: str = ""
    ) -> str:
        """Insert (or replace) an entity occurrence.

        Args:
            memory_id: ID of the originating memory.
            entity_name: Entity name.
            entity_type: Type (ORG, PERSON, TECH, ...).
            confidence: Extraction confidence (0-1).
            span_start: Start offset in the source text.
            span_end: End offset in the source text.
            context_snippet: Text surrounding the entity.

        Returns:
            The deterministic entity ID (``ent_<memory>_<name>``).
        """
        entity_id = f"ent_{memory_id}_{entity_name.lower().replace(' ', '_')}"

        conn = self._connect()
        conn.execute("""
            INSERT OR REPLACE INTO entities
            (id, memory_id, entity_name, entity_type, confidence, span_start, span_end, context_snippet)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            entity_id,
            memory_id,
            entity_name,
            entity_type,
            confidence,
            span_start,
            span_end,
            context_snippet
        ))
        conn.commit()
        conn.close()

        return entity_id

    def get_entities_by_memory(self, memory_id: str) -> List[Dict[str, Any]]:
        """Return all entities extracted from one memory.

        Args:
            memory_id: Memory identifier.

        Returns:
            List of entity rows as dicts.
        """
        conn = self._connect()
        cursor = conn.execute(
            "SELECT * FROM entities WHERE memory_id = ?",
            (memory_id,)
        )
        results = [dict(row) for row in cursor.fetchall()]
        conn.close()
        return results

    def get_entities_by_name(self, entity_name: str) -> List[Dict[str, Any]]:
        """Find entities by name (case-insensitive).

        Args:
            entity_name: Entity name to look up.

        Returns:
            List of matching entity rows as dicts.
        """
        conn = self._connect()
        cursor = conn.execute(
            "SELECT * FROM entities WHERE LOWER(entity_name) = LOWER(?)",
            (entity_name,)
        )
        results = [dict(row) for row in cursor.fetchall()]
        conn.close()
        return results

    def delete_entities_by_memory(self, memory_id: str) -> int:
        """Delete all entities (and their relationships) of one memory.

        Args:
            memory_id: Memory identifier.

        Returns:
            Number of entity rows removed.
        """
        conn = self._connect()
        cursor = conn.execute(
            "DELETE FROM entities WHERE memory_id = ?",
            (memory_id,)
        )
        deleted = cursor.rowcount

        # Edges sourced from this memory must go with the nodes.
        conn.execute(
            "DELETE FROM entity_relationships WHERE memory_id = ?",
            (memory_id,)
        )

        conn.commit()
        conn.close()
        return deleted

    # ========================================================================
    # RELATIONSHIP OPERATIONS
    # ========================================================================

    def insert_relationship(
        self,
        source_entity: str,
        target_entity: str,
        relationship_type: str,
        memory_id: str
    ) -> str:
        """Insert (or replace) a relationship between two entity names.

        Args:
            source_entity: Source entity name.
            target_entity: Target entity name.
            relationship_type: Relationship label.
            memory_id: ID of the originating memory.

        Returns:
            The deterministic relationship ID.
        """
        rel_id = f"rel_{source_entity}_{target_entity}_{memory_id}"

        conn = self._connect()
        conn.execute("""
            INSERT OR REPLACE INTO entity_relationships
            (id, source_entity, target_entity, relationship_type, memory_id)
            VALUES (?, ?, ?, ?, ?)
        """, (rel_id, source_entity, target_entity, relationship_type, memory_id))
        conn.commit()
        conn.close()

        return rel_id

    def get_relationships(self, entity_name: str) -> List[Dict[str, Any]]:
        """Return relationships touching an entity, in both directions.

        Args:
            entity_name: Entity name (case-insensitive match).

        Returns:
            List of relationship dicts, each tagged with a ``direction`` of
            ``'outgoing'`` or ``'incoming'``.
        """
        conn = self._connect()

        # Edges where the entity is the source.
        cursor = conn.execute("""
            SELECT 'outgoing' as direction, r.*, e.entity_type as target_type
            FROM entity_relationships r
            LEFT JOIN entities e ON LOWER(e.entity_name) = LOWER(r.target_entity)
            WHERE LOWER(r.source_entity) = LOWER(?)
        """, (entity_name,))
        outgoing = [dict(row) for row in cursor.fetchall()]

        # Edges where the entity is the target.
        cursor = conn.execute("""
            SELECT 'incoming' as direction, r.*, e.entity_type as source_type
            FROM entity_relationships r
            LEFT JOIN entities e ON LOWER(e.entity_name) = LOWER(r.source_entity)
            WHERE LOWER(r.target_entity) = LOWER(?)
        """, (entity_name,))
        incoming = [dict(row) for row in cursor.fetchall()]

        conn.close()
        return outgoing + incoming

    # ========================================================================
    # GRAPH TRAVERSAL (BFS)
    # ========================================================================

    def traverse(
        self,
        start_entity: str,
        depth: int = 2,
        entity_types: Optional[List[str]] = None,
        max_nodes: int = 50
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Breadth-first traversal of the graph from a starting entity.

        Args:
            start_entity: Name of the starting entity.
            depth: Maximum depth (1-3 recommended).
            entity_types: Optional filter on entity types.
            max_nodes: Hard cap on returned nodes.

        Returns:
            Tuple ``(nodes, edges)`` suitable for graph visualization.
        """
        nodes: Dict[str, Dict[str, Any]] = {}
        edges: List[Dict[str, Any]] = []
        seen_edges: Set[str] = set()  # O(1) dedup instead of scanning `edges`

        # BFS queue of (entity name, depth).
        queue: deque = deque([(start_entity, 0)])
        visited: Set[str] = set()

        while queue and len(nodes) < max_nodes:
            entity_name, current_depth = queue.popleft()

            if entity_name in visited:
                continue
            visited.add(entity_name)

            entities = self.get_entities_by_name(entity_name)
            if not entities:
                continue

            entity = entities[0]  # first match stands in for the node

            # Optional type filter.
            if entity_types and entity["entity_type"] not in entity_types:
                continue

            # Add / update the node.
            node_key = entity["entity_name"].lower()
            if node_key not in nodes:
                nodes[node_key] = {
                    "name": entity["entity_name"],
                    "type": entity["entity_type"],
                    "depth": current_depth,
                    "memory_count": 1
                }
            else:
                nodes[node_key]["memory_count"] += 1

            # Do not expand past the depth limit.
            if current_depth >= depth:
                continue

            for rel in self.get_relationships(entity_name):
                # BUG FIX: the original if/else on `direction` had two
                # byte-identical branches; source/target come straight
                # from the row regardless of direction.
                source = rel["source_entity"]
                target = rel["target_entity"]

                edge_key = f"{source.lower()}_{target.lower()}"
                if edge_key not in seen_edges:
                    seen_edges.add(edge_key)
                    edges.append({
                        "key": edge_key,
                        "source": source,
                        "target": target,
                        "type": rel["relationship_type"],
                        "memory_id": rel.get("memory_id")
                    })

                # Follow the edge away from the current entity.
                next_entity = target if rel["direction"] == "outgoing" else source
                if next_entity not in visited:
                    queue.append((next_entity, current_depth + 1))

        return list(nodes.values()), edges

    # ========================================================================
    # ENTITY SEARCH (integrates with QueryEngine)
    # ========================================================================

    def search_by_query(self, query: str, limit: int = 20) -> List[Dict[str, Any]]:
        """Search memories connected to entities mentioned in a query.

        Extracts candidate entities from the query text and returns the
        memories those entities appear in, with a normalized graph score.

        Args:
            query: Search text.
            limit: Maximum number of results.

        Returns:
            List of memory dicts sorted by descending score, at most
            ``limit`` long.
        """
        query_entities = self._extract_query_entities(query)
        if not query_entities:
            return []

        conn = self._connect()

        results = {}
        for entity_name in query_entities:
            cursor = conn.execute("""
                SELECT DISTINCT e.memory_id, e.entity_name, e.entity_type,
                       COUNT(*) as entity_count
                FROM entities e
                WHERE LOWER(e.entity_name) = LOWER(?)
                GROUP BY e.memory_id
                ORDER BY entity_count DESC
                LIMIT ?
            """, (entity_name, limit))

            for row in cursor.fetchall():
                memory_id = row["memory_id"]
                if memory_id not in results:
                    results[memory_id] = {
                        "memory_id": memory_id,
                        "matched_entities": [],
                        "score": 0.0
                    }

                results[memory_id]["matched_entities"].append({
                    "name": row["entity_name"],
                    "type": row["entity_type"]
                })
                results[memory_id]["score"] += 0.5  # base score per matched entity

        conn.close()

        # Normalize scores to [0, 1].
        if results:
            max_score = max(r["score"] for r in results.values())
            for r in results.values():
                r["score"] = r["score"] / max_score if max_score > 0 else 0

        # BUG FIX: the per-entity SQL LIMIT did not bound the merged result
        # set; rank and truncate the final list as the signature promises.
        ranked = sorted(results.values(), key=lambda r: r["score"], reverse=True)
        return ranked[:limit]

    def _extract_query_entities(self, query: str) -> List[str]:
        """Extract candidate entity names from a query string.

        Uses simple heuristics (no spaCy, to avoid the dependency here):
        quoted terms, capitalized words, and acronyms.

        Args:
            query: Search text.

        Returns:
            List of candidate entity names.
        """
        entities = set()

        # Quoted terms.
        quoted = re.findall(r'"([^"]+)"', query)
        entities.update(quoted)

        # Capitalized words (likely proper nouns).
        capitalized = re.findall(r'\b[A-Z][a-zA-Z]*\b', query)
        entities.update(capitalized)

        # Acronyms.
        acronyms = re.findall(r'\b[A-Z]{2,}\b', query)
        entities.update(acronyms)

        # Drop stop words and very short tokens.
        stop_words = {"A", "O", "Os", "As", "Um", "Uma", "Em", "De", "Do", "Da", "Com", "Por", "Para"}
        entities = {e for e in entities if e not in stop_words and len(e) > 2}

        return list(entities)

    # ========================================================================
    # FRONTMATTER INTEGRATION
    # ========================================================================

    def extract_from_frontmatter(
        self,
        memory_id: str,
        frontmatter: Dict[str, Any],
        project: str
    ) -> List[str]:
        """Create entities from a memory's frontmatter metadata.

        Args:
            memory_id: Memory identifier.
            frontmatter: Metadata dictionary.
            project: Project name.

        Returns:
            IDs of the entities created.
        """
        entity_ids = []

        # Type becomes a META entity.
        if "type" in frontmatter:
            eid = self.insert_entity(
                memory_id,
                f"TYPE:{frontmatter['type']}",
                "META",
                confidence=1.0
            )
            entity_ids.append(eid)

        # Project becomes a PROJECT entity.
        if project:
            eid = self.insert_entity(
                memory_id,
                project,
                "PROJECT",
                confidence=1.0
            )
            entity_ids.append(eid)

        # Tags become TAG entities; accept both "a, b" strings and lists.
        if "tags" in frontmatter:
            tags = frontmatter.get("tags", "")
            if isinstance(tags, str):
                tags = [t.strip() for t in tags.split(",") if t.strip()]
            if isinstance(tags, list):
                for tag in tags:
                    eid = self.insert_entity(
                        memory_id,
                        f"TAG:{tag}",
                        "TAG",
                        confidence=1.0
                    )
                    entity_ids.append(eid)

        return entity_ids

    def extract_from_content(
        self,
        memory_id: str,
        content: str,
        use_spacy: bool = True
    ) -> List[str]:
        """Extract entities from free text, preferably with spaCy NER.

        Skips work when the content hash matches the cache and entities
        already exist; otherwise re-extracts from scratch.

        Args:
            memory_id: Memory identifier.
            content: Text content.
            use_spacy: Use spaCy when available (default True).

        Returns:
            IDs of the entities created (empty when cached).
        """
        # Only reprocess when the content actually changed.
        cached_hash = self.get_cached_hash(memory_id)
        current_hash = self._compute_hash(content)

        if cached_hash == current_hash:
            existing = self.get_entities_by_memory(memory_id)
            if existing:
                return []  # already processed, nothing changed

        # New or changed content: drop stale entities and re-extract.
        self.delete_entities_by_memory(memory_id)

        entity_ids = []

        if use_spacy:
            try:
                import spacy

                # Load the model; install it on first use.
                try:
                    nlp = spacy.load("pt_core_news_sm")
                except OSError:
                    import subprocess
                    import sys
                    # BUG FIX: use sys.executable instead of "python" —
                    # the bare name may be missing from PATH or point at a
                    # different interpreter than the one running this code.
                    subprocess.run(
                        [sys.executable, "-m", "spacy", "download", "pt_core_news_sm"],
                        capture_output=True
                    )
                    nlp = spacy.load("pt_core_news_sm")

                # Cap input length for performance.
                doc = nlp(content[:5000])

                # Map spaCy labels onto our entity types.
                label_map = {
                    "ORG": "ORG",
                    "PERSON": "PERSON",
                    "GPE": "LOC",
                    "LOC": "LOC",
                    "PRODUCT": "PRODUCT",
                    "EVENT": "EVENT",
                    "WORK_OF_ART": "PRODUCT",
                    "LAW": "PRODUCT",
                    "LANGUAGE": "TECH",
                }

                for ent in doc.ents:
                    if ent.label_ in label_map:
                        entity_type = label_map[ent.label_]
                        # Use a custom NER score extension when present.
                        confidence = float(ent._.get("nerd_score", 0.8)) if hasattr(ent._, "nerd_score") else 0.8

                        eid = self.insert_entity(
                            memory_id,
                            ent.text,
                            entity_type,
                            confidence=confidence,
                            span_start=ent.start_char,
                            span_end=ent.end_char,
                            context_snippet=content[max(0, ent.start_char - 25):ent.end_char + 25][:50]
                        )
                        entity_ids.append(eid)

                self.update_cache(memory_id, content, len(entity_ids))

            except ImportError:
                # spaCy unavailable: degrade to the heuristic fallback.
                entity_ids.extend(self._extract_entities_fallback(memory_id, content))
                self.update_cache(memory_id, content, len(entity_ids))
            except Exception:
                # Deliberate best-effort: NER failure must not break promotion.
                pass
        else:
            entity_ids.extend(self._extract_entities_fallback(memory_id, content))
            self.update_cache(memory_id, content, len(entity_ids))

        return entity_ids

    def _extract_entities_fallback(
        self,
        memory_id: str,
        content: str
    ) -> List[str]:
        """Heuristic extraction without spaCy (capitalized-word scan).

        Args:
            memory_id: Memory identifier.
            content: Text content.

        Returns:
            IDs of the entities created.
        """
        entity_ids = []

        # Capitalized words in the first 2000 chars (likely names/orgs).
        capitalized = re.findall(r'\b[A-Z][a-zA-Z]{2,}\b', content[:2000])

        # Filter common sentence-initial words that are not entities.
        common_words = {
            "The", "This", "That", "These", "Those", "What", "When", "Where", "Why", "How",
            "Para", "Com", "Por", "Em", "De", "Do", "Da", "Dos", "Das", "Uma", "Um",
            "Como", "Quando", "Onde", "Qual", "Quais", "Quem", "Sobre"
        }

        entities = set(e for e in capitalized if e not in common_words)

        for entity in list(entities)[:20]:  # cap at 20 entities
            # NOTE: the regex only matches capitalized tokens, so the
            # original `"ORG" if entity[0].isupper() else "PERSON"` branch
            # was dead code — classify uniformly as ORG.
            eid = self.insert_entity(
                memory_id,
                entity,
                "ORG",
                confidence=0.5,  # low confidence without spaCy
                context_snippet=entity
            )
            entity_ids.append(eid)

        return entity_ids

    # ========================================================================
    # HASH CACHE (avoids reprocessing unchanged content)
    # ========================================================================

    def _compute_hash(self, content: str) -> str:
        """Return a short SHA-256 digest (first 16 hex chars) of *content*."""
        import hashlib
        return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]

    def get_cached_hash(self, memory_id: str) -> Optional[str]:
        """Return the cached content hash for a memory, or None.

        Args:
            memory_id: Memory identifier.

        Returns:
            Stored hash, or None when never processed.
        """
        conn = self._connect()
        cursor = conn.execute(
            "SELECT content_hash FROM entity_cache WHERE memory_id = ?",
            (memory_id,)
        )
        row = cursor.fetchone()
        conn.close()
        return row["content_hash"] if row else None

    def is_content_changed(self, memory_id: str, content: str) -> bool:
        """Check whether content changed since the last processing.

        Args:
            memory_id: Memory identifier.
            content: Current text content.

        Returns:
            True when the hash differs (or nothing is cached).
        """
        current_hash = self._compute_hash(content)
        cached_hash = self.get_cached_hash(memory_id)
        return cached_hash != current_hash

    def update_cache(self, memory_id: str, content: str, entity_count: int) -> None:
        """Record the content hash after processing a memory.

        Args:
            memory_id: Memory identifier.
            content: Text content that was processed.
            entity_count: Number of entities extracted.
        """
        content_hash = self._compute_hash(content)

        conn = self._connect()
        conn.execute("""
            INSERT OR REPLACE INTO entity_cache
            (memory_id, content_hash, processed_at, entity_count)
            VALUES (?, ?, datetime('now'), ?)
        """, (memory_id, content_hash, entity_count))
        conn.commit()
        conn.close()

    def clear_cache(self, memory_id: str) -> None:
        """Drop the cache entry of one memory, forcing reprocessing.

        Args:
            memory_id: Memory identifier.
        """
        conn = self._connect()
        conn.execute(
            "DELETE FROM entity_cache WHERE memory_id = ?",
            (memory_id,)
        )
        conn.commit()
        conn.close()

    # ========================================================================
    # STATISTICS
    # ========================================================================

    def get_stats(self) -> Dict[str, Any]:
        """Return aggregate statistics about the graph.

        Returns:
            Dict with totals, per-type counts, and average degree.
        """
        conn = self._connect()

        total_entities = conn.execute(
            "SELECT COUNT(*) FROM entities"
        ).fetchone()[0]

        total_relationships = conn.execute(
            "SELECT COUNT(*) FROM entity_relationships"
        ).fetchone()[0]

        by_type = conn.execute(
            "SELECT entity_type, COUNT(*) FROM entities GROUP BY entity_type"
        ).fetchall()

        conn.close()

        return {
            "total_entities": total_entities,
            "total_relationships": total_relationships,
            "by_type": dict(by_type),
            "avg_relationships_per_entity": (
                total_relationships / total_entities if total_entities > 0 else 0
            )
        }
package/src/index/queries.py
CHANGED
|
@@ -6,6 +6,7 @@ from dataclasses import dataclass
|
|
|
6
6
|
|
|
7
7
|
from .metadata_db import MetadataDB
|
|
8
8
|
from .embeddings_db import EmbeddingsDB
|
|
9
|
+
from .entities_db import EntitiesDB
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
@dataclass
|
|
@@ -16,7 +17,7 @@ class QueryResult:
|
|
|
16
17
|
project: str
|
|
17
18
|
title: str
|
|
18
19
|
score: float
|
|
19
|
-
source: str # 'fts', 'semantic', 'metadata'
|
|
20
|
+
source: str # 'fts', 'semantic', 'metadata', 'graph'
|
|
20
21
|
metadata: Dict[str, Any] = None
|
|
21
22
|
|
|
22
23
|
|
|
@@ -24,16 +25,18 @@ class QueryEngine:
|
|
|
24
25
|
"""
|
|
25
26
|
Engine de consultas híbridas.
|
|
26
27
|
|
|
27
|
-
Combina
|
|
28
|
+
Combina quatro tipos de busca:
|
|
28
29
|
- Metadata: filtros estruturados (projeto, tipo, tags)
|
|
29
30
|
- FTS: busca full-text no conteúdo
|
|
30
31
|
- Semantic: busca por similaridade de embeddings
|
|
32
|
+
- Graph: busca por entidades e relacionamentos
|
|
31
33
|
"""
|
|
32
34
|
|
|
33
35
|
def __init__(
|
|
34
36
|
self,
|
|
35
37
|
metadata_db: MetadataDB,
|
|
36
|
-
embeddings_db: EmbeddingsDB
|
|
38
|
+
embeddings_db: EmbeddingsDB,
|
|
39
|
+
entities_db: Optional[EntitiesDB] = None
|
|
37
40
|
):
|
|
38
41
|
"""
|
|
39
42
|
Inicializa o QueryEngine.
|
|
@@ -41,9 +44,11 @@ class QueryEngine:
|
|
|
41
44
|
Args:
|
|
42
45
|
metadata_db: Instância do MetadataDB
|
|
43
46
|
embeddings_db: Instância do EmbeddingsDB
|
|
47
|
+
entities_db: Instância do EntitiesDB (opcional)
|
|
44
48
|
"""
|
|
45
49
|
self.metadata_db = metadata_db
|
|
46
50
|
self.embeddings_db = embeddings_db
|
|
51
|
+
self.entities_db = entities_db
|
|
47
52
|
|
|
48
53
|
def search(
|
|
49
54
|
self,
|
|
@@ -53,11 +58,13 @@ class QueryEngine:
|
|
|
53
58
|
limit: int = 10,
|
|
54
59
|
use_fts: bool = True,
|
|
55
60
|
use_semantic: bool = True,
|
|
56
|
-
|
|
57
|
-
|
|
61
|
+
use_graph: bool = True,
|
|
62
|
+
fts_weight: float = 0.3,
|
|
63
|
+
semantic_weight: float = 0.5,
|
|
64
|
+
graph_weight: float = 0.2
|
|
58
65
|
) -> List[QueryResult]:
|
|
59
66
|
"""
|
|
60
|
-
Busca híbrida combinando FTS e
|
|
67
|
+
Busca híbrida combinando FTS, semantic e graph.
|
|
61
68
|
|
|
62
69
|
Args:
|
|
63
70
|
query: Texto de busca
|
|
@@ -66,8 +73,10 @@ class QueryEngine:
|
|
|
66
73
|
limit: Limite de resultados
|
|
67
74
|
use_fts: Usar busca FTS
|
|
68
75
|
use_semantic: Usar busca semantic
|
|
76
|
+
use_graph: Usar busca por graph
|
|
69
77
|
fts_weight: Peso da busca FTS
|
|
70
78
|
semantic_weight: Peso da busca semantic
|
|
79
|
+
graph_weight: Peso da busca graph
|
|
71
80
|
|
|
72
81
|
Returns:
|
|
73
82
|
Lista de resultados ordenados por relevância
|
|
@@ -78,6 +87,7 @@ class QueryEngine:
|
|
|
78
87
|
if use_fts:
|
|
79
88
|
fts_results = self._search_fts(query, project, mem_type, limit)
|
|
80
89
|
for r in fts_results:
|
|
90
|
+
r.score *= fts_weight # Aplica peso FTS desde o início
|
|
81
91
|
results[r.memory_id] = r
|
|
82
92
|
|
|
83
93
|
# Busca Semantic
|
|
@@ -85,12 +95,9 @@ class QueryEngine:
|
|
|
85
95
|
semantic_results = self._search_semantic(query, project, limit)
|
|
86
96
|
for r in semantic_results:
|
|
87
97
|
if r.memory_id in results:
|
|
88
|
-
# Combina scores
|
|
98
|
+
# Combina scores: média ponderada (FTS já tem peso aplicado)
|
|
89
99
|
existing = results[r.memory_id]
|
|
90
|
-
combined_score = (
|
|
91
|
-
existing.score * fts_weight +
|
|
92
|
-
r.score * semantic_weight
|
|
93
|
-
)
|
|
100
|
+
combined_score = existing.score + (r.score * semantic_weight)
|
|
94
101
|
results[r.memory_id] = QueryResult(
|
|
95
102
|
memory_id=r.memory_id,
|
|
96
103
|
type=r.type,
|
|
@@ -104,6 +111,27 @@ class QueryEngine:
|
|
|
104
111
|
r.score *= semantic_weight
|
|
105
112
|
results[r.memory_id] = r
|
|
106
113
|
|
|
114
|
+
# Busca por Graph (entidades)
|
|
115
|
+
if use_graph and self.entities_db:
|
|
116
|
+
graph_results = self._search_by_graph(query, limit)
|
|
117
|
+
for r in graph_results:
|
|
118
|
+
if r.memory_id in results:
|
|
119
|
+
# Combina scores: soma ponderada (scores anteriores já têm peso)
|
|
120
|
+
existing = results[r.memory_id]
|
|
121
|
+
combined_score = existing.score + (r.score * graph_weight)
|
|
122
|
+
results[r.memory_id] = QueryResult(
|
|
123
|
+
memory_id=r.memory_id,
|
|
124
|
+
type=r.type,
|
|
125
|
+
project=r.project,
|
|
126
|
+
title=r.title,
|
|
127
|
+
score=combined_score,
|
|
128
|
+
source="hybrid",
|
|
129
|
+
metadata=r.metadata
|
|
130
|
+
)
|
|
131
|
+
else:
|
|
132
|
+
r.score *= graph_weight
|
|
133
|
+
results[r.memory_id] = r
|
|
134
|
+
|
|
107
135
|
# Filtra por tipo se especificado
|
|
108
136
|
if mem_type:
|
|
109
137
|
results = {
|
|
@@ -198,6 +226,48 @@ class QueryEngine:
|
|
|
198
226
|
|
|
199
227
|
return results
|
|
200
228
|
|
|
229
|
+
def _search_by_graph(
|
|
230
|
+
self,
|
|
231
|
+
query: str,
|
|
232
|
+
limit: int
|
|
233
|
+
) -> List[QueryResult]:
|
|
234
|
+
"""
|
|
235
|
+
Busca por entidades no grafo.
|
|
236
|
+
|
|
237
|
+
Extrai entidades da query e retorna memórias conectadas.
|
|
238
|
+
|
|
239
|
+
Args:
|
|
240
|
+
query: Texto de busca
|
|
241
|
+
limit: Limite de resultados
|
|
242
|
+
|
|
243
|
+
Returns:
|
|
244
|
+
Lista de resultados com score de grafo
|
|
245
|
+
"""
|
|
246
|
+
if not self.entities_db:
|
|
247
|
+
return []
|
|
248
|
+
|
|
249
|
+
graph_results = self.entities_db.search_by_query(query, limit * 2)
|
|
250
|
+
|
|
251
|
+
results = []
|
|
252
|
+
for item in graph_results:
|
|
253
|
+
# Busca metadados adicionais
|
|
254
|
+
memory = self.metadata_db.get_by_id(item["memory_id"])
|
|
255
|
+
|
|
256
|
+
results.append(QueryResult(
|
|
257
|
+
memory_id=item["memory_id"],
|
|
258
|
+
type=memory.get("type", "unknown") if memory else "unknown",
|
|
259
|
+
project=memory.get("project", "unknown") if memory else "unknown",
|
|
260
|
+
title=memory.get("title", item["memory_id"]) if memory else item["memory_id"],
|
|
261
|
+
score=item["score"],
|
|
262
|
+
source="graph",
|
|
263
|
+
metadata={
|
|
264
|
+
"matched_entities": item.get("matched_entities", []),
|
|
265
|
+
"graph_score": item["score"]
|
|
266
|
+
}
|
|
267
|
+
))
|
|
268
|
+
|
|
269
|
+
return results
|
|
270
|
+
|
|
201
271
|
def search_by_metadata(
|
|
202
272
|
self,
|
|
203
273
|
project: Optional[str] = None,
|
package/src/mcp/server.py
CHANGED
|
@@ -26,6 +26,7 @@ from src.consolidation.extractor import Extractor
|
|
|
26
26
|
from src.consolidation.promoter import Promoter
|
|
27
27
|
from src.index.metadata_db import MetadataDB
|
|
28
28
|
from src.index.embeddings_db import EmbeddingsDB
|
|
29
|
+
from src.index.entities_db import EntitiesDB
|
|
29
30
|
from src.index.queries import QueryEngine
|
|
30
31
|
from src.hooks.custom_loader import HooksLoader, HookRunner
|
|
31
32
|
from src.diff.memory_diff import MemoryDiff
|
|
@@ -81,10 +82,15 @@ class CerebroMCP:
|
|
|
81
82
|
|
|
82
83
|
self.metadata_db = MetadataDB(self.cerebro_path / "index" / "metadata.db")
|
|
83
84
|
self.embeddings_db = EmbeddingsDB(self.cerebro_path / "index" / "embeddings.db")
|
|
84
|
-
self.
|
|
85
|
+
self.entities_db = EntitiesDB(self.cerebro_path / "index" / "entities.db")
|
|
86
|
+
self.query_engine = QueryEngine(self.metadata_db, self.embeddings_db, self.entities_db)
|
|
85
87
|
|
|
86
88
|
self.extractor = Extractor(self.raw_storage, self.working_storage)
|
|
87
|
-
self.promoter = Promoter(
|
|
89
|
+
self.promoter = Promoter(
|
|
90
|
+
self.working_storage,
|
|
91
|
+
self.official_storage,
|
|
92
|
+
self.cerebro_path / "index" / "entities.db"
|
|
93
|
+
)
|
|
88
94
|
|
|
89
95
|
self.memory_view = MemoryView(
|
|
90
96
|
self.cerebro_path,
|
|
@@ -340,6 +346,31 @@ class CerebroMCP:
|
|
|
340
346
|
}
|
|
341
347
|
}
|
|
342
348
|
}
|
|
349
|
+
),
|
|
350
|
+
Tool(
|
|
351
|
+
name="cerebro_graph",
|
|
352
|
+
description="Explora grafo de entidades - mostra conexões entre projetos, tecnologias, pessoas e decisões",
|
|
353
|
+
inputSchema={
|
|
354
|
+
"type": "object",
|
|
355
|
+
"properties": {
|
|
356
|
+
"entity": {
|
|
357
|
+
"type": "string",
|
|
358
|
+
"description": "Nome da entidade para iniciar traversal (ex: 'MedicsPro', 'JWT', 'autenticação')"
|
|
359
|
+
},
|
|
360
|
+
"depth": {
|
|
361
|
+
"type": "integer",
|
|
362
|
+
"description": "Profundidade máxima do traversal (1-3, padrão: 2)",
|
|
363
|
+
"default": 2
|
|
364
|
+
},
|
|
365
|
+
"types": {
|
|
366
|
+
"type": "array",
|
|
367
|
+
"items": {"type": "string"},
|
|
368
|
+
"description": "Filtrar por tipos de entidade (ex: ['ORG', 'TECH'])",
|
|
369
|
+
"default": ["ORG", "TECH", "PERSON", "PROJECT"]
|
|
370
|
+
}
|
|
371
|
+
},
|
|
372
|
+
"required": ["entity"]
|
|
373
|
+
}
|
|
343
374
|
)
|
|
344
375
|
]
|
|
345
376
|
|
|
@@ -377,6 +408,8 @@ class CerebroMCP:
|
|
|
377
408
|
result = self._gc(arguments)
|
|
378
409
|
elif name == "cerebro_capture_memory":
|
|
379
410
|
result = self._capture_memory(arguments)
|
|
411
|
+
elif name == "cerebro_graph":
|
|
412
|
+
result = self._cerebro_graph(arguments)
|
|
380
413
|
else:
|
|
381
414
|
return [TextContent(type="text", text=f"Ferramenta desconhecida: {name}")]
|
|
382
415
|
|
|
@@ -499,7 +532,8 @@ class CerebroMCP:
|
|
|
499
532
|
"",
|
|
500
533
|
"Índice:",
|
|
501
534
|
f" Metadata DB: {self.cerebro_path / 'index' / 'metadata.db'}",
|
|
502
|
-
f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}"
|
|
535
|
+
f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}",
|
|
536
|
+
f" Entities DB: {self.cerebro_path / 'index' / 'entities.db'}"
|
|
503
537
|
]
|
|
504
538
|
|
|
505
539
|
return "\n".join(lines)
|
|
@@ -679,8 +713,14 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
|
|
|
679
713
|
|
|
680
714
|
desc_match = re.search(r'description:\s*(.*)', content)
|
|
681
715
|
type_match = re.search(r'type:\s*(.*)', content)
|
|
716
|
+
project_match = re.search(r'project:\s*(.*)', content)
|
|
717
|
+
tags_match = re.search(r'tags:\s*(.*)', content)
|
|
718
|
+
|
|
682
719
|
desc = desc_match.group(1).strip() if desc_match else "sem descrição"
|
|
683
720
|
m_type = type_match.group(1).strip() if type_match else "project"
|
|
721
|
+
project = project_match.group(1).strip() if project_match else "unknown"
|
|
722
|
+
tags = tags_match.group(1).strip() if tags_match else ""
|
|
723
|
+
|
|
684
724
|
ts = datetime.now().strftime("%Y-%m-%d")
|
|
685
725
|
entry = f"- [{m_type}] {mem_name}.md ({ts}): {desc}\n"
|
|
686
726
|
|
|
@@ -693,6 +733,29 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
|
|
|
693
733
|
else:
|
|
694
734
|
index_path.write_text(f"# Memórias do Projeto\n\n{entry}", encoding="utf-8")
|
|
695
735
|
|
|
736
|
+
# BUG FIX: Registrar entidades no grafo (frontmatter + conteúdo)
|
|
737
|
+
if self.entities_db:
|
|
738
|
+
import yaml
|
|
739
|
+
frontmatter_match = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
|
|
740
|
+
if frontmatter_match:
|
|
741
|
+
try:
|
|
742
|
+
frontmatter = yaml.safe_load(frontmatter_match.group(1))
|
|
743
|
+
# Extrai entidades do frontmatter
|
|
744
|
+
self.entities_db.extract_from_frontmatter(
|
|
745
|
+
memory_id=mem_name,
|
|
746
|
+
frontmatter=frontmatter or {},
|
|
747
|
+
project=project
|
|
748
|
+
)
|
|
749
|
+
# Extrai entidades do conteúdo (spaCy NER)
|
|
750
|
+
body_content = frontmatter_match.group(2)
|
|
751
|
+
self.entities_db.extract_from_content(
|
|
752
|
+
memory_id=mem_name,
|
|
753
|
+
content=body_content,
|
|
754
|
+
use_spacy=True
|
|
755
|
+
)
|
|
756
|
+
except Exception as e:
|
|
757
|
+
pass # Falha silenciosa se frontmatter inválido
|
|
758
|
+
|
|
696
759
|
return f"✅ Memória '{mem_name}' salva em {file_path}"
|
|
697
760
|
|
|
698
761
|
def _remember(self, args: Dict[str, Any]) -> str:
|
|
@@ -717,6 +780,86 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
|
|
|
717
780
|
)
|
|
718
781
|
return gc.generate_gc_report(results)
|
|
719
782
|
|
|
783
|
+
def _cerebro_graph(self, args: Dict[str, Any]) -> str:
|
|
784
|
+
"""Explora grafo de entidades"""
|
|
785
|
+
entity = args.get("entity")
|
|
786
|
+
if not entity:
|
|
787
|
+
return "Erro: 'entity' é obrigatório para cerebro_graph"
|
|
788
|
+
|
|
789
|
+
depth = args.get("depth", 2)
|
|
790
|
+
entity_types = args.get("types", ["ORG", "TECH", "PERSON", "PROJECT"])
|
|
791
|
+
|
|
792
|
+
# Limita profundidade máxima para evitar traversal muito grande
|
|
793
|
+
depth = min(depth, 3)
|
|
794
|
+
|
|
795
|
+
nodes, edges = self.entities_db.traverse(
|
|
796
|
+
start_entity=entity,
|
|
797
|
+
depth=depth,
|
|
798
|
+
entity_types=entity_types,
|
|
799
|
+
max_nodes=50
|
|
800
|
+
)
|
|
801
|
+
|
|
802
|
+
if not nodes:
|
|
803
|
+
return f"Nenhuma entidade encontrada para '{entity}'"
|
|
804
|
+
|
|
805
|
+
# Formata grafo como árvore
|
|
806
|
+
return self._format_graph(nodes, edges, entity)
|
|
807
|
+
|
|
808
|
+
def _format_graph(
|
|
809
|
+
self,
|
|
810
|
+
nodes: List[Dict[str, Any]],
|
|
811
|
+
edges: List[Dict[str, Any]],
|
|
812
|
+
root_entity: str
|
|
813
|
+
) -> str:
|
|
814
|
+
"""Formata grafo como árvore visual"""
|
|
815
|
+
lines = [f"## Grafo de '{root_entity}'\n"]
|
|
816
|
+
lines.append(f"**{len(nodes)}** entidades encontradas, **{len(edges)}** conexões\n")
|
|
817
|
+
|
|
818
|
+
# Constroi adjacency list
|
|
819
|
+
adj: Dict[str, List[Dict[str, Any]]] = {}
|
|
820
|
+
for edge in edges:
|
|
821
|
+
source = edge["source"].lower()
|
|
822
|
+
if source not in adj:
|
|
823
|
+
adj[source] = []
|
|
824
|
+
adj[source].append(edge)
|
|
825
|
+
|
|
826
|
+
# BFS para imprimir árvore
|
|
827
|
+
visited = set()
|
|
828
|
+
queue = [(root_entity.lower(), 0)]
|
|
829
|
+
|
|
830
|
+
while queue:
|
|
831
|
+
entity_name, depth = queue.pop(0)
|
|
832
|
+
|
|
833
|
+
if entity_name in visited:
|
|
834
|
+
continue
|
|
835
|
+
visited.add(entity_name)
|
|
836
|
+
|
|
837
|
+
# Encontra nó correspondente
|
|
838
|
+
node = next((n for n in nodes if n["name"].lower() == entity_name), None)
|
|
839
|
+
if not node:
|
|
840
|
+
continue
|
|
841
|
+
|
|
842
|
+
# Imprime nó
|
|
843
|
+
prefix = " " * depth
|
|
844
|
+
connector = "├─ " if depth > 0 else ""
|
|
845
|
+
lines.append(f"{prefix}{connector}{node['name']} ({node['type']})")
|
|
846
|
+
|
|
847
|
+
# Adiciona filhos na fila
|
|
848
|
+
if depth < 3:
|
|
849
|
+
children = adj.get(entity_name, [])
|
|
850
|
+
for child in children:
|
|
851
|
+
child_name = child["target"].lower()
|
|
852
|
+
if child_name not in visited:
|
|
853
|
+
queue.append((child_name, depth + 1))
|
|
854
|
+
|
|
855
|
+
# Lista todas as arestas
|
|
856
|
+
if edges:
|
|
857
|
+
lines.append("\n## Conexões")
|
|
858
|
+
for edge in edges:
|
|
859
|
+
lines.append(f"- {edge['source']} → {edge['target']} ({edge['type']})")
|
|
860
|
+
|
|
861
|
+
return "\n".join(lines)
|
|
862
|
+
|
|
720
863
|
|
|
721
864
|
async def main():
|
|
722
865
|
"""Entry point do MCP Server"""
|