ocerebro 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ocerebro",
3
- "version": "0.2.3",
3
+ "version": "0.3.1",
4
4
  "description": "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)",
5
5
  "main": "bin/ocerebro.js",
6
6
  "bin": {
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ocerebro"
7
- version = "0.2.3"
7
+ version = "0.3.1"
8
8
  description = "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -37,6 +37,7 @@ dependencies = [
37
37
  "sentence-transformers>=2.2.0",
38
38
  "mcp>=1.0.0",
39
39
  "anthropic>=0.40.0",
40
+ "spacy>=3.5.0",
40
41
  ]
41
42
 
42
43
  [project.optional-dependencies]
@@ -37,7 +37,8 @@ class Promoter:
37
37
  def __init__(
38
38
  self,
39
39
  working_storage: YAMLStorage,
40
- official_storage: MarkdownStorage
40
+ official_storage: MarkdownStorage,
41
+ entities_db_path: Optional[Path] = None
41
42
  ):
42
43
  """
43
44
  Inicializa o Promoter.
@@ -45,9 +46,12 @@ class Promoter:
45
46
  Args:
46
47
  working_storage: Instância do YAMLStorage
47
48
  official_storage: Instância do MarkdownStorage
49
+ entities_db_path: Path para o EntitiesDB (opcional)
48
50
  """
49
51
  self.working_storage = working_storage
50
52
  self.official_storage = official_storage
53
+ self.entities_db_path = entities_db_path
54
+ self._entities_db = None
51
55
 
52
56
  def promote_session(
53
57
  self,
@@ -223,6 +227,10 @@ class Promoter:
223
227
  content=content
224
228
  )
225
229
 
230
+ # Extrai entidades do frontmatter E do conteúdo
231
+ self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
232
+ self._extract_entities_from_content(draft_id, content)
233
+
226
234
  return PromotionResult(
227
235
  success=True,
228
236
  source_type=draft.get("type", "session"),
@@ -300,6 +308,10 @@ class Promoter:
300
308
  content=body
301
309
  )
302
310
 
311
+ # Extrai entidades do frontmatter E do conteúdo
312
+ self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
313
+ self._extract_entities_from_content(draft_id, body)
314
+
303
315
  return PromotionResult(
304
316
  success=True,
305
317
  source_type=draft.get("type", "session"),
@@ -433,3 +445,61 @@ class Promoter:
433
445
  if existing:
434
446
  existing.update(draft)
435
447
  self.working_storage.write_feature(project, draft_id, existing)
448
+
449
@property
def entities_db(self):
    """Lazily build and cache the EntitiesDB (None when no path was configured)."""
    if self._entities_db is not None:
        return self._entities_db
    if self.entities_db_path:
        # Imported here to keep the dependency optional at construction time.
        from src.index.entities_db import EntitiesDB
        self._entities_db = EntitiesDB(self.entities_db_path)
    return self._entities_db
456
+
457
def _extract_entities_from_frontmatter(
    self,
    memory_id: str,
    frontmatter: Dict[str, Any],
    project: str
) -> List[str]:
    """
    Register graph entities derived from a memory's frontmatter.

    Args:
        memory_id: Memory identifier
        frontmatter: Metadata dictionary
        project: Project name

    Returns:
        IDs of the entities that were created (empty list when no
        EntitiesDB is configured)
    """
    db = self.entities_db
    if not db:
        # Graph support is optional; silently skip when disabled.
        return []

    return db.extract_from_frontmatter(
        memory_id=memory_id,
        frontmatter=frontmatter,
        project=project
    )
482
+
483
def _extract_entities_from_content(
    self,
    memory_id: str,
    content: str
) -> List[str]:
    """
    Register graph entities extracted from body text via spaCy NER.

    Args:
        memory_id: Memory identifier
        content: Text content to analyze

    Returns:
        IDs of the entities that were created (empty list when no
        EntitiesDB is configured)
    """
    db = self.entities_db
    if not db:
        # Graph support is optional; silently skip when disabled.
        return []

    return db.extract_from_content(
        memory_id=memory_id,
        content=content,
        use_spacy=True
    )
@@ -0,0 +1,822 @@
1
+ """EntitiesDB: Grafo de experiência com entidades e relacionamentos"""
2
+
3
+ import sqlite3
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional, Tuple, Set
7
+ from datetime import datetime
8
+ from collections import deque
9
+
10
+
11
class EntitiesDB:
    """
    SQLite-backed experience graph of entities and relationships.

    Stores entities extracted from memories (ORG, PERSON, TECH, ...) and the
    relationships between them, and supports associative lookup through BFS
    traversal of the graph.

    Differences vs LightRAG-style pipelines:
    - local extraction with spaCy NER (offline, free)
    - frontmatter fields become initial nodes (no LLM calls)
    - implicit edges via project/tags/type
    """

    def __init__(self, db_path: Path):
        """
        Initialize the EntitiesDB.

        Args:
            db_path: Path to the database file (parent dirs are created)
        """
        self.db_path = db_path
        db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_schema()

    def _connect(self) -> sqlite3.Connection:
        """Open a connection (WAL journal, dict-like rows)."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        return conn

    def _init_schema(self):
        """Create tables and indexes if they do not exist yet."""
        conn = self._connect()

        # Entities table.
        # NOTE(review): the FOREIGN KEY references a `memories` table that is
        # not created here — presumably it lives in another DB/schema. SQLite
        # does not enforce FKs unless PRAGMA foreign_keys=ON, so this is
        # currently informational only; confirm intent.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS entities (
                id TEXT PRIMARY KEY,
                memory_id TEXT,
                entity_name TEXT,
                entity_type TEXT,
                confidence REAL DEFAULT 1.0,
                span_start INTEGER,
                span_end INTEGER,
                context_snippet TEXT,
                created_at TEXT DEFAULT (datetime('now')),
                FOREIGN KEY (memory_id) REFERENCES memories(id)
            )
        """)

        # Lookup indexes.
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_entities_name
            ON entities(entity_name)
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_entities_type
            ON entities(entity_type)
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_entities_memory
            ON entities(memory_id)
        """)

        # Content-hash cache (avoids re-running NER on unchanged content).
        conn.execute("""
            CREATE TABLE IF NOT EXISTS entity_cache (
                memory_id TEXT PRIMARY KEY,
                content_hash TEXT,
                processed_at TEXT DEFAULT (datetime('now')),
                entity_count INTEGER DEFAULT 0
            )
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_entity_cache_hash
            ON entity_cache(content_hash)
        """)

        # Relationships table.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS entity_relationships (
                id TEXT PRIMARY KEY,
                source_entity TEXT,
                target_entity TEXT,
                relationship_type TEXT,
                memory_id TEXT,
                created_at TEXT DEFAULT (datetime('now')),
                FOREIGN KEY (memory_id) REFERENCES memories(id)
            )
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_relationships_source
            ON entity_relationships(source_entity)
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_relationships_target
            ON entity_relationships(target_entity)
        """)

        conn.commit()
        conn.close()

    # ========================================================================
    # ENTITY OPERATIONS
    # ========================================================================

    def insert_entity(
        self,
        memory_id: str,
        entity_name: str,
        entity_type: str,
        confidence: float = 1.0,
        span_start: int = 0,
        span_end: int = 0,
        context_snippet: str = ""
    ) -> str:
        """
        Insert (or replace) one entity.

        Args:
            memory_id: Source memory ID
            entity_name: Entity name
            entity_type: Type (ORG, PERSON, TECH, ...)
            confidence: Extraction confidence (0-1)
            span_start: Start offset in the source text
            span_end: End offset in the source text
            context_snippet: Text surrounding the entity

        Returns:
            The entity ID (deterministic per memory+name, so re-inserting
            the same entity upserts instead of duplicating)
        """
        entity_id = f"ent_{memory_id}_{entity_name.lower().replace(' ', '_')}"

        conn = self._connect()
        conn.execute("""
            INSERT OR REPLACE INTO entities
            (id, memory_id, entity_name, entity_type, confidence, span_start, span_end, context_snippet)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            entity_id,
            memory_id,
            entity_name,
            entity_type,
            confidence,
            span_start,
            span_end,
            context_snippet
        ))
        conn.commit()
        conn.close()

        return entity_id

    def get_entities_by_memory(self, memory_id: str) -> List[Dict[str, Any]]:
        """
        Fetch all entities belonging to one memory.

        Args:
            memory_id: Memory ID

        Returns:
            List of entity row dicts
        """
        conn = self._connect()
        cursor = conn.execute(
            "SELECT * FROM entities WHERE memory_id = ?",
            (memory_id,)
        )
        results = [dict(row) for row in cursor.fetchall()]
        conn.close()
        return results

    def get_entities_by_name(self, entity_name: str) -> List[Dict[str, Any]]:
        """
        Find entities by name (case-insensitive exact match).

        Args:
            entity_name: Entity name

        Returns:
            List of entity row dicts
        """
        conn = self._connect()
        cursor = conn.execute(
            "SELECT * FROM entities WHERE LOWER(entity_name) = LOWER(?)",
            (entity_name,)
        )
        results = [dict(row) for row in cursor.fetchall()]
        conn.close()
        return results

    def delete_entities_by_memory(self, memory_id: str) -> int:
        """
        Delete a memory's entities (and its relationships).

        Args:
            memory_id: Memory ID

        Returns:
            Number of entity rows removed (relationship rows not counted)
        """
        conn = self._connect()
        cursor = conn.execute(
            "DELETE FROM entities WHERE memory_id = ?",
            (memory_id,)
        )
        deleted = cursor.rowcount

        # Cascade to relationships originating from this memory.
        conn.execute(
            "DELETE FROM entity_relationships WHERE memory_id = ?",
            (memory_id,)
        )

        conn.commit()
        conn.close()
        return deleted

    # ========================================================================
    # RELATIONSHIP OPERATIONS
    # ========================================================================

    def insert_relationship(
        self,
        source_entity: str,
        target_entity: str,
        relationship_type: str,
        memory_id: str
    ) -> str:
        """
        Insert (or replace) a relationship between two entities.

        Args:
            source_entity: Source entity name
            target_entity: Target entity name
            relationship_type: Relationship type label
            memory_id: Source memory ID

        Returns:
            The relationship ID
        """
        rel_id = f"rel_{source_entity}_{target_entity}_{memory_id}"

        conn = self._connect()
        conn.execute("""
            INSERT OR REPLACE INTO entity_relationships
            (id, source_entity, target_entity, relationship_type, memory_id)
            VALUES (?, ?, ?, ?, ?)
        """, (rel_id, source_entity, target_entity, relationship_type, memory_id))
        conn.commit()
        conn.close()

        return rel_id

    def get_relationships(self, entity_name: str) -> List[Dict[str, Any]]:
        """
        Fetch all relationships touching an entity.

        Args:
            entity_name: Entity name (case-insensitive)

        Returns:
            Relationship dicts, both outgoing and incoming, each tagged
            with a 'direction' key
        """
        conn = self._connect()

        # Relationships where the entity is the source.
        cursor = conn.execute("""
            SELECT 'outgoing' as direction, r.*, e.entity_type as target_type
            FROM entity_relationships r
            LEFT JOIN entities e ON LOWER(e.entity_name) = LOWER(r.target_entity)
            WHERE LOWER(r.source_entity) = LOWER(?)
        """, (entity_name,))
        outgoing = [dict(row) for row in cursor.fetchall()]

        # Relationships where the entity is the target.
        cursor = conn.execute("""
            SELECT 'incoming' as direction, r.*, e.entity_type as source_type
            FROM entity_relationships r
            LEFT JOIN entities e ON LOWER(e.entity_name) = LOWER(r.source_entity)
            WHERE LOWER(r.target_entity) = LOWER(?)
        """, (entity_name,))
        incoming = [dict(row) for row in cursor.fetchall()]

        conn.close()
        return outgoing + incoming

    # ========================================================================
    # GRAPH TRAVERSAL (BFS)
    # ========================================================================

    def traverse(
        self,
        start_entity: str,
        depth: int = 2,
        entity_types: Optional[List[str]] = None,
        max_nodes: int = 50
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        BFS traversal starting from an entity.

        Args:
            start_entity: Name of the starting entity
            depth: Maximum depth (1-3 recommended)
            entity_types: Optional entity-type filter
            max_nodes: Hard cap on returned nodes

        Returns:
            (nodes, edges) tuple suitable for graph visualization
        """
        nodes: Dict[str, Dict[str, Any]] = {}
        edges: List[Dict[str, Any]] = []

        # BFS queue of (entity name, depth).
        queue: deque = deque([(start_entity, 0)])
        visited: Set[str] = set()

        while queue and len(nodes) < max_nodes:
            entity_name, current_depth = queue.popleft()

            if entity_name in visited:
                continue
            visited.add(entity_name)

            # Resolve the entity; unknown names are silently skipped.
            entities = self.get_entities_by_name(entity_name)
            if not entities:
                continue

            entity = entities[0]  # first match wins

            # Optional type filter.
            if entity_types and entity["entity_type"] not in entity_types:
                continue

            # Add (or update) the node.
            node_key = entity["entity_name"].lower()
            if node_key not in nodes:
                nodes[node_key] = {
                    "name": entity["entity_name"],
                    "type": entity["entity_type"],
                    "depth": current_depth,
                    "memory_count": 1
                }
            else:
                nodes[node_key]["memory_count"] += 1

            # Do not expand past the depth limit.
            if current_depth >= depth:
                continue

            for rel in self.get_relationships(entity_name):
                # (The original had an if/else here whose branches were
                # identical — collapsed to straight assignments.)
                source = rel["source_entity"]
                target = rel["target_entity"]

                # Record the edge once per (source, target) pair.
                edge_key = f"{source.lower()}_{target.lower()}"
                if not any(e.get("key") == edge_key for e in edges):
                    edges.append({
                        "key": edge_key,
                        "source": source,
                        "target": target,
                        "type": rel["relationship_type"],
                        "memory_id": rel.get("memory_id")
                    })

                # Enqueue the neighbor on the far side of the edge.
                next_entity = target if rel["direction"] == "outgoing" else source
                if next_entity not in visited:
                    queue.append((next_entity, current_depth + 1))

        return list(nodes.values()), edges

    # ========================================================================
    # ENTITY SEARCH (integrates with QueryEngine)
    # ========================================================================

    def search_by_query(self, query: str, limit: int = 20) -> List[Dict[str, Any]]:
        """
        Find memories connected to entities mentioned in a query.

        Extracts candidate entities from the query text and returns the
        memories linked to them, with a normalized graph score.

        Args:
            query: Search text
            limit: Per-entity result limit

        Returns:
            List of {memory_id, matched_entities, score} dicts
        """
        query_entities = self._extract_query_entities(query)
        if not query_entities:
            return []

        conn = self._connect()

        results: Dict[str, Dict[str, Any]] = {}
        for entity_name in query_entities:
            cursor = conn.execute("""
                SELECT DISTINCT e.memory_id, e.entity_name, e.entity_type,
                       COUNT(*) as entity_count
                FROM entities e
                WHERE LOWER(e.entity_name) = LOWER(?)
                GROUP BY e.memory_id
                ORDER BY entity_count DESC
                LIMIT ?
            """, (entity_name, limit))

            for row in cursor.fetchall():
                memory_id = row["memory_id"]
                if memory_id not in results:
                    results[memory_id] = {
                        "memory_id": memory_id,
                        "matched_entities": [],
                        "score": 0.0
                    }

                results[memory_id]["matched_entities"].append({
                    "name": row["entity_name"],
                    "type": row["entity_type"]
                })
                results[memory_id]["score"] += 0.5  # base score per matched entity

        conn.close()

        # Normalize scores to [0, 1].
        if results:
            max_score = max(r["score"] for r in results.values())
            for r in results.values():
                r["score"] = r["score"] / max_score if max_score > 0 else 0

        return list(results.values())

    def _extract_query_entities(self, query: str) -> List[str]:
        """
        Extract candidate entity names from a query.

        Uses simple heuristics (no spaCy, to avoid the dependency here):
        quoted phrases, capitalized words, and acronyms.

        Args:
            query: Search text

        Returns:
            List of candidate entity names
        """
        entities = set()

        # Quoted phrases.
        quoted = re.findall(r'"([^"]+)"', query)
        entities.update(quoted)

        # Capitalized words (likely proper nouns).
        capitalized = re.findall(r'\b[A-Z][a-zA-Z]*\b', query)
        entities.update(capitalized)

        # Acronyms.
        acronyms = re.findall(r'\b[A-Z]{2,}\b', query)
        entities.update(acronyms)

        # Drop Portuguese stop words and very short tokens.
        stop_words = {"A", "O", "Os", "As", "Um", "Uma", "Em", "De", "Do", "Da", "Com", "Por", "Para"}
        entities = {e for e in entities if e not in stop_words and len(e) > 2}

        return list(entities)

    # ========================================================================
    # FRONTMATTER INTEGRATION
    # ========================================================================

    def extract_from_frontmatter(
        self,
        memory_id: str,
        frontmatter: Dict[str, Any],
        project: str
    ) -> List[str]:
        """
        Create graph entities from a memory's frontmatter.

        Args:
            memory_id: Memory ID
            frontmatter: Metadata dictionary
            project: Project name

        Returns:
            List of created entity IDs
        """
        entity_ids = []

        # The memory type becomes a META node.
        if "type" in frontmatter:
            eid = self.insert_entity(
                memory_id,
                f"TYPE:{frontmatter['type']}",
                "META",
                confidence=1.0
            )
            entity_ids.append(eid)

        # The project becomes a PROJECT node.
        if project:
            eid = self.insert_entity(
                memory_id,
                project,
                "PROJECT",
                confidence=1.0
            )
            entity_ids.append(eid)

        # Tags become TAG nodes; accepts a comma-separated string or a list.
        if "tags" in frontmatter:
            tags = frontmatter.get("tags", "")
            if isinstance(tags, str):
                tags = [t.strip() for t in tags.split(",") if t.strip()]
            elif not isinstance(tags, list):
                tags = []
            for tag in tags:
                eid = self.insert_entity(
                    memory_id,
                    f"TAG:{tag}",
                    "TAG",
                    confidence=1.0
                )
                entity_ids.append(eid)

        return entity_ids

    def extract_from_content(
        self,
        memory_id: str,
        content: str,
        use_spacy: bool = True
    ) -> List[str]:
        """
        Extract entities from content text, preferably with spaCy NER.

        Skips work entirely when the content hash matches the cache and
        entities already exist. Falls back to capitalization heuristics
        when spaCy is unavailable or fails at runtime.

        Args:
            memory_id: Memory ID
            content: Text content
            use_spacy: Use spaCy (default: True)

        Returns:
            List of created entity IDs (empty when cached & unchanged)
        """
        # Cache check — only reprocess when the content changed.
        cached_hash = self.get_cached_hash(memory_id)
        current_hash = self._compute_hash(content)

        if cached_hash == current_hash:
            if self.get_entities_by_memory(memory_id):
                return []  # already processed, nothing changed

        # New or changed content — drop stale entities and reprocess.
        self.delete_entities_by_memory(memory_id)

        if use_spacy:
            try:
                entity_ids = self._extract_with_spacy(memory_id, content)
            except ImportError:
                # spaCy not installed — heuristic fallback.
                entity_ids = self._extract_entities_fallback(memory_id, content)
            except Exception:
                # BUGFIX: the old code swallowed runtime spaCy failures with
                # `pass` AFTER deleting the old entities, leaving the memory
                # with no entities and a stale cache. Fall back to the
                # heuristic extractor instead.
                entity_ids = self._extract_entities_fallback(memory_id, content)
        else:
            entity_ids = self._extract_entities_fallback(memory_id, content)

        # Record the processed hash so unchanged content is skipped next time.
        self.update_cache(memory_id, content, len(entity_ids))

        return entity_ids

    def _extract_with_spacy(self, memory_id: str, content: str) -> List[str]:
        """
        Run spaCy NER over (a prefix of) the content and persist entities.

        Raises:
            ImportError: when spaCy is not installed
            OSError: when the model cannot be loaded even after download
        """
        import spacy

        # Load the Portuguese model; attempt a one-off download if missing.
        try:
            nlp = spacy.load("pt_core_news_sm")
        except OSError:
            import subprocess
            import sys
            # BUGFIX: use the running interpreter, not the bare "python"
            # binary, which may point elsewhere (or not exist) in venvs.
            subprocess.run(
                [sys.executable, "-m", "spacy", "download", "pt_core_news_sm"],
                capture_output=True
            )
            nlp = spacy.load("pt_core_news_sm")

        doc = nlp(content[:5000])  # cap input size for performance

        # Map spaCy labels onto our entity types.
        label_map = {
            "ORG": "ORG",
            "PERSON": "PERSON",
            "GPE": "LOC",
            "LOC": "LOC",
            "PRODUCT": "PRODUCT",
            "EVENT": "EVENT",
            "WORK_OF_ART": "PRODUCT",
            "LAW": "PRODUCT",
            "LANGUAGE": "TECH",
        }

        entity_ids = []
        for ent in doc.ents:
            if ent.label_ not in label_map:
                continue
            # Custom-extension score when present, else a fixed default.
            confidence = float(ent._.get("nerd_score", 0.8)) if hasattr(ent._, "nerd_score") else 0.8
            eid = self.insert_entity(
                memory_id,
                ent.text,
                label_map[ent.label_],
                confidence=confidence,
                span_start=ent.start_char,
                span_end=ent.end_char,
                context_snippet=content[max(0, ent.start_char - 25):ent.end_char + 25][:50]
            )
            entity_ids.append(eid)
        return entity_ids

    def _extract_entities_fallback(
        self,
        memory_id: str,
        content: str
    ) -> List[str]:
        """
        spaCy-free fallback using simple capitalization heuristics.

        Args:
            memory_id: Memory ID
            content: Text content

        Returns:
            List of created entity IDs (capped at 20)
        """
        entity_ids = []

        # Capitalized words — likely proper nouns / org names.
        capitalized = re.findall(r'\b[A-Z][a-zA-Z]{2,}\b', content[:2000])

        # Filter common words that are not entities.
        common_words = {
            "The", "This", "That", "These", "Those", "What", "When", "Where", "Why", "How",
            "Para", "Com", "Por", "Em", "De", "Do", "Da", "Dos", "Das", "Uma", "Um",
            "Como", "Quando", "Onde", "Qual", "Quais", "Quem", "Sobre"
        }

        entities = set(e for e in capitalized if e not in common_words)

        for entity in list(entities)[:20]:  # cap at 20 entities
            # The regex guarantees an uppercase first letter, so the old
            # `... if entity[0].isupper() else "PERSON"` branch was dead code;
            # every heuristic match is classified as ORG.
            eid = self.insert_entity(
                memory_id,
                entity,
                "ORG",
                confidence=0.5,  # low confidence without spaCy
                context_snippet=entity
            )
            entity_ids.append(eid)

        return entity_ids

    # ========================================================================
    # HASH CACHE (avoids reprocessing)
    # ========================================================================

    def _compute_hash(self, content: str) -> str:
        """
        Compute a short content hash.

        Args:
            content: Text content

        Returns:
            First 16 hex chars of the SHA256 digest
        """
        import hashlib
        return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]

    def get_cached_hash(self, memory_id: str) -> Optional[str]:
        """
        Read the cached content hash for a memory.

        Args:
            memory_id: Memory ID

        Returns:
            Stored hash, or None when never processed
        """
        conn = self._connect()
        cursor = conn.execute(
            "SELECT content_hash FROM entity_cache WHERE memory_id = ?",
            (memory_id,)
        )
        row = cursor.fetchone()
        conn.close()
        return row["content_hash"] if row else None

    def is_content_changed(self, memory_id: str, content: str) -> bool:
        """
        Check whether content changed since the last processing run.

        Args:
            memory_id: Memory ID
            content: Text content

        Returns:
            True when changed (or never processed), False when identical
        """
        current_hash = self._compute_hash(content)
        cached_hash = self.get_cached_hash(memory_id)
        return cached_hash != current_hash

    def update_cache(self, memory_id: str, content: str, entity_count: int) -> None:
        """
        Record the processed content hash for a memory.

        Args:
            memory_id: Memory ID
            content: Text content
            entity_count: Number of extracted entities
        """
        content_hash = self._compute_hash(content)

        conn = self._connect()
        conn.execute("""
            INSERT OR REPLACE INTO entity_cache
            (memory_id, content_hash, processed_at, entity_count)
            VALUES (?, ?, datetime('now'), ?)
        """, (memory_id, content_hash, entity_count))
        conn.commit()
        conn.close()

    def clear_cache(self, memory_id: str) -> None:
        """
        Drop the cache entry for one memory.

        Args:
            memory_id: Memory ID
        """
        conn = self._connect()
        conn.execute(
            "DELETE FROM entity_cache WHERE memory_id = ?",
            (memory_id,)
        )
        conn.commit()
        conn.close()

    # ========================================================================
    # STATISTICS
    # ========================================================================

    def get_stats(self) -> Dict[str, Any]:
        """
        Compute aggregate graph statistics.

        Returns:
            Dict with entity/relationship totals, per-type counts and the
            average relationships-per-entity ratio
        """
        conn = self._connect()

        total_entities = conn.execute(
            "SELECT COUNT(*) FROM entities"
        ).fetchone()[0]

        total_relationships = conn.execute(
            "SELECT COUNT(*) FROM entity_relationships"
        ).fetchone()[0]

        by_type = conn.execute(
            "SELECT entity_type, COUNT(*) FROM entities GROUP BY entity_type"
        ).fetchall()

        conn.close()

        return {
            "total_entities": total_entities,
            "total_relationships": total_relationships,
            "by_type": dict(by_type),
            "avg_relationships_per_entity": (
                total_relationships / total_entities if total_entities > 0 else 0
            )
        }
@@ -6,6 +6,7 @@ from dataclasses import dataclass
6
6
 
7
7
  from .metadata_db import MetadataDB
8
8
  from .embeddings_db import EmbeddingsDB
9
+ from .entities_db import EntitiesDB
9
10
 
10
11
 
11
12
  @dataclass
@@ -16,7 +17,7 @@ class QueryResult:
16
17
  project: str
17
18
  title: str
18
19
  score: float
19
- source: str # 'fts', 'semantic', 'metadata'
20
+ source: str # 'fts', 'semantic', 'metadata', 'graph'
20
21
  metadata: Dict[str, Any] = None
21
22
 
22
23
 
@@ -24,16 +25,18 @@ class QueryEngine:
24
25
  """
25
26
  Engine de consultas híbridas.
26
27
 
27
- Combina três tipos de busca:
28
+ Combina quatro tipos de busca:
28
29
  - Metadata: filtros estruturados (projeto, tipo, tags)
29
30
  - FTS: busca full-text no conteúdo
30
31
  - Semantic: busca por similaridade de embeddings
32
+ - Graph: busca por entidades e relacionamentos
31
33
  """
32
34
 
33
35
  def __init__(
34
36
  self,
35
37
  metadata_db: MetadataDB,
36
- embeddings_db: EmbeddingsDB
38
+ embeddings_db: EmbeddingsDB,
39
+ entities_db: Optional[EntitiesDB] = None
37
40
  ):
38
41
  """
39
42
  Inicializa o QueryEngine.
@@ -41,9 +44,11 @@ class QueryEngine:
41
44
  Args:
42
45
  metadata_db: Instância do MetadataDB
43
46
  embeddings_db: Instância do EmbeddingsDB
47
+ entities_db: Instância do EntitiesDB (opcional)
44
48
  """
45
49
  self.metadata_db = metadata_db
46
50
  self.embeddings_db = embeddings_db
51
+ self.entities_db = entities_db
47
52
 
48
53
  def search(
49
54
  self,
@@ -53,11 +58,13 @@ class QueryEngine:
53
58
  limit: int = 10,
54
59
  use_fts: bool = True,
55
60
  use_semantic: bool = True,
56
- fts_weight: float = 0.4,
57
- semantic_weight: float = 0.6
61
+ use_graph: bool = True,
62
+ fts_weight: float = 0.3,
63
+ semantic_weight: float = 0.5,
64
+ graph_weight: float = 0.2
58
65
  ) -> List[QueryResult]:
59
66
  """
60
- Busca híbrida combinando FTS e semantic.
67
+ Busca híbrida combinando FTS, semantic e graph.
61
68
 
62
69
  Args:
63
70
  query: Texto de busca
@@ -66,8 +73,10 @@ class QueryEngine:
66
73
  limit: Limite de resultados
67
74
  use_fts: Usar busca FTS
68
75
  use_semantic: Usar busca semantic
76
+ use_graph: Usar busca por graph
69
77
  fts_weight: Peso da busca FTS
70
78
  semantic_weight: Peso da busca semantic
79
+ graph_weight: Peso da busca graph
71
80
 
72
81
  Returns:
73
82
  Lista de resultados ordenados por relevância
@@ -78,6 +87,7 @@ class QueryEngine:
78
87
  if use_fts:
79
88
  fts_results = self._search_fts(query, project, mem_type, limit)
80
89
  for r in fts_results:
90
+ r.score *= fts_weight # Aplica peso FTS desde o início
81
91
  results[r.memory_id] = r
82
92
 
83
93
  # Busca Semantic
@@ -85,12 +95,9 @@ class QueryEngine:
85
95
  semantic_results = self._search_semantic(query, project, limit)
86
96
  for r in semantic_results:
87
97
  if r.memory_id in results:
88
- # Combina scores
98
+ # Combina scores: média ponderada (FTS já tem peso aplicado)
89
99
  existing = results[r.memory_id]
90
- combined_score = (
91
- existing.score * fts_weight +
92
- r.score * semantic_weight
93
- )
100
+ combined_score = existing.score + (r.score * semantic_weight)
94
101
  results[r.memory_id] = QueryResult(
95
102
  memory_id=r.memory_id,
96
103
  type=r.type,
@@ -104,6 +111,27 @@ class QueryEngine:
104
111
  r.score *= semantic_weight
105
112
  results[r.memory_id] = r
106
113
 
114
+ # Busca por Graph (entidades)
115
+ if use_graph and self.entities_db:
116
+ graph_results = self._search_by_graph(query, limit)
117
+ for r in graph_results:
118
+ if r.memory_id in results:
119
+ # Combina scores: soma ponderada (scores anteriores já têm peso)
120
+ existing = results[r.memory_id]
121
+ combined_score = existing.score + (r.score * graph_weight)
122
+ results[r.memory_id] = QueryResult(
123
+ memory_id=r.memory_id,
124
+ type=r.type,
125
+ project=r.project,
126
+ title=r.title,
127
+ score=combined_score,
128
+ source="hybrid",
129
+ metadata=r.metadata
130
+ )
131
+ else:
132
+ r.score *= graph_weight
133
+ results[r.memory_id] = r
134
+
107
135
  # Filtra por tipo se especificado
108
136
  if mem_type:
109
137
  results = {
@@ -198,6 +226,48 @@ class QueryEngine:
198
226
 
199
227
  return results
200
228
 
229
+ def _search_by_graph(
230
+ self,
231
+ query: str,
232
+ limit: int
233
+ ) -> List[QueryResult]:
234
+ """
235
+ Busca por entidades no grafo.
236
+
237
+ Extrai entidades da query e retorna memórias conectadas.
238
+
239
+ Args:
240
+ query: Texto de busca
241
+ limit: Limite de resultados
242
+
243
+ Returns:
244
+ Lista de resultados com score de grafo
245
+ """
246
+ if not self.entities_db:
247
+ return []
248
+
249
+ graph_results = self.entities_db.search_by_query(query, limit * 2)
250
+
251
+ results = []
252
+ for item in graph_results:
253
+ # Busca metadados adicionais
254
+ memory = self.metadata_db.get_by_id(item["memory_id"])
255
+
256
+ results.append(QueryResult(
257
+ memory_id=item["memory_id"],
258
+ type=memory.get("type", "unknown") if memory else "unknown",
259
+ project=memory.get("project", "unknown") if memory else "unknown",
260
+ title=memory.get("title", item["memory_id"]) if memory else item["memory_id"],
261
+ score=item["score"],
262
+ source="graph",
263
+ metadata={
264
+ "matched_entities": item.get("matched_entities", []),
265
+ "graph_score": item["score"]
266
+ }
267
+ ))
268
+
269
+ return results
270
+
201
271
  def search_by_metadata(
202
272
  self,
203
273
  project: Optional[str] = None,
package/src/mcp/server.py CHANGED
@@ -26,6 +26,7 @@ from src.consolidation.extractor import Extractor
26
26
  from src.consolidation.promoter import Promoter
27
27
  from src.index.metadata_db import MetadataDB
28
28
  from src.index.embeddings_db import EmbeddingsDB
29
+ from src.index.entities_db import EntitiesDB
29
30
  from src.index.queries import QueryEngine
30
31
  from src.hooks.custom_loader import HooksLoader, HookRunner
31
32
  from src.diff.memory_diff import MemoryDiff
@@ -81,10 +82,15 @@ class CerebroMCP:
81
82
 
82
83
  self.metadata_db = MetadataDB(self.cerebro_path / "index" / "metadata.db")
83
84
  self.embeddings_db = EmbeddingsDB(self.cerebro_path / "index" / "embeddings.db")
84
- self.query_engine = QueryEngine(self.metadata_db, self.embeddings_db)
85
+ self.entities_db = EntitiesDB(self.cerebro_path / "index" / "entities.db")
86
+ self.query_engine = QueryEngine(self.metadata_db, self.embeddings_db, self.entities_db)
85
87
 
86
88
  self.extractor = Extractor(self.raw_storage, self.working_storage)
87
- self.promoter = Promoter(self.working_storage, self.official_storage)
89
+ self.promoter = Promoter(
90
+ self.working_storage,
91
+ self.official_storage,
92
+ self.cerebro_path / "index" / "entities.db"
93
+ )
88
94
 
89
95
  self.memory_view = MemoryView(
90
96
  self.cerebro_path,
@@ -340,6 +346,31 @@ class CerebroMCP:
340
346
  }
341
347
  }
342
348
  }
349
+ ),
350
+ Tool(
351
+ name="cerebro_graph",
352
+ description="Explora grafo de entidades - mostra conexões entre projetos, tecnologias, pessoas e decisões",
353
+ inputSchema={
354
+ "type": "object",
355
+ "properties": {
356
+ "entity": {
357
+ "type": "string",
358
+ "description": "Nome da entidade para iniciar traversal (ex: 'MedicsPro', 'JWT', 'autenticação')"
359
+ },
360
+ "depth": {
361
+ "type": "integer",
362
+ "description": "Profundidade máxima do traversal (1-3, padrão: 2)",
363
+ "default": 2
364
+ },
365
+ "types": {
366
+ "type": "array",
367
+ "items": {"type": "string"},
368
+ "description": "Filtrar por tipos de entidade (ex: ['ORG', 'TECH'])",
369
+ "default": ["ORG", "TECH", "PERSON", "PROJECT"]
370
+ }
371
+ },
372
+ "required": ["entity"]
373
+ }
343
374
  )
344
375
  ]
345
376
 
@@ -377,6 +408,8 @@ class CerebroMCP:
377
408
  result = self._gc(arguments)
378
409
  elif name == "cerebro_capture_memory":
379
410
  result = self._capture_memory(arguments)
411
+ elif name == "cerebro_graph":
412
+ result = self._cerebro_graph(arguments)
380
413
  else:
381
414
  return [TextContent(type="text", text=f"Ferramenta desconhecida: {name}")]
382
415
 
@@ -499,7 +532,8 @@ class CerebroMCP:
499
532
  "",
500
533
  "Índice:",
501
534
  f" Metadata DB: {self.cerebro_path / 'index' / 'metadata.db'}",
502
- f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}"
535
+ f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}",
536
+ f" Entities DB: {self.cerebro_path / 'index' / 'entities.db'}"
503
537
  ]
504
538
 
505
539
  return "\n".join(lines)
@@ -679,8 +713,14 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
679
713
 
680
714
  desc_match = re.search(r'description:\s*(.*)', content)
681
715
  type_match = re.search(r'type:\s*(.*)', content)
716
+ project_match = re.search(r'project:\s*(.*)', content)
717
+ tags_match = re.search(r'tags:\s*(.*)', content)
718
+
682
719
  desc = desc_match.group(1).strip() if desc_match else "sem descrição"
683
720
  m_type = type_match.group(1).strip() if type_match else "project"
721
+ project = project_match.group(1).strip() if project_match else "unknown"
722
+ tags = tags_match.group(1).strip() if tags_match else ""
723
+
684
724
  ts = datetime.now().strftime("%Y-%m-%d")
685
725
  entry = f"- [{m_type}] {mem_name}.md ({ts}): {desc}\n"
686
726
 
@@ -693,6 +733,29 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
693
733
  else:
694
734
  index_path.write_text(f"# Memórias do Projeto\n\n{entry}", encoding="utf-8")
695
735
 
736
+ # BUG FIX: Registrar entidades no grafo (frontmatter + conteúdo)
737
+ if self.entities_db:
738
+ import yaml
739
+ frontmatter_match = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
740
+ if frontmatter_match:
741
+ try:
742
+ frontmatter = yaml.safe_load(frontmatter_match.group(1))
743
+ # Extrai entidades do frontmatter
744
+ self.entities_db.extract_from_frontmatter(
745
+ memory_id=mem_name,
746
+ frontmatter=frontmatter or {},
747
+ project=project
748
+ )
749
+ # Extrai entidades do conteúdo (spaCy NER)
750
+ body_content = frontmatter_match.group(2)
751
+ self.entities_db.extract_from_content(
752
+ memory_id=mem_name,
753
+ content=body_content,
754
+ use_spacy=True
755
+ )
756
+ except Exception as e:
757
+ pass # Falha silenciosa se frontmatter inválido
758
+
696
759
  return f"✅ Memória '{mem_name}' salva em {file_path}"
697
760
 
698
761
  def _remember(self, args: Dict[str, Any]) -> str:
@@ -717,6 +780,86 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
717
780
  )
718
781
  return gc.generate_gc_report(results)
719
782
 
783
+ def _cerebro_graph(self, args: Dict[str, Any]) -> str:
784
+ """Explora grafo de entidades"""
785
+ entity = args.get("entity")
786
+ if not entity:
787
+ return "Erro: 'entity' é obrigatório para cerebro_graph"
788
+
789
+ depth = args.get("depth", 2)
790
+ entity_types = args.get("types", ["ORG", "TECH", "PERSON", "PROJECT"])
791
+
792
+ # Limita profundidade máxima para evitar traversal muito grande
793
+ depth = min(depth, 3)
794
+
795
+ nodes, edges = self.entities_db.traverse(
796
+ start_entity=entity,
797
+ depth=depth,
798
+ entity_types=entity_types,
799
+ max_nodes=50
800
+ )
801
+
802
+ if not nodes:
803
+ return f"Nenhuma entidade encontrada para '{entity}'"
804
+
805
+ # Formata grafo como árvore
806
+ return self._format_graph(nodes, edges, entity)
807
+
808
+ def _format_graph(
809
+ self,
810
+ nodes: List[Dict[str, Any]],
811
+ edges: List[Dict[str, Any]],
812
+ root_entity: str
813
+ ) -> str:
814
+ """Formata grafo como árvore visual"""
815
+ lines = [f"## Grafo de '{root_entity}'\n"]
816
+ lines.append(f"**{len(nodes)}** entidades encontradas, **{len(edges)}** conexões\n")
817
+
818
+ # Constroi adjacency list
819
+ adj: Dict[str, List[Dict[str, Any]]] = {}
820
+ for edge in edges:
821
+ source = edge["source"].lower()
822
+ if source not in adj:
823
+ adj[source] = []
824
+ adj[source].append(edge)
825
+
826
+ # BFS para imprimir árvore
827
+ visited = set()
828
+ queue = [(root_entity.lower(), 0)]
829
+
830
+ while queue:
831
+ entity_name, depth = queue.pop(0)
832
+
833
+ if entity_name in visited:
834
+ continue
835
+ visited.add(entity_name)
836
+
837
+ # Encontra nó correspondente
838
+ node = next((n for n in nodes if n["name"].lower() == entity_name), None)
839
+ if not node:
840
+ continue
841
+
842
+ # Imprime nó
843
+ prefix = " " * depth
844
+ connector = "├─ " if depth > 0 else ""
845
+ lines.append(f"{prefix}{connector}{node['name']} ({node['type']})")
846
+
847
+ # Adiciona filhos na fila
848
+ if depth < 3:
849
+ children = adj.get(entity_name, [])
850
+ for child in children:
851
+ child_name = child["target"].lower()
852
+ if child_name not in visited:
853
+ queue.append((child_name, depth + 1))
854
+
855
+ # Lista todas as arestas
856
+ if edges:
857
+ lines.append("\n## Conexões")
858
+ for edge in edges:
859
+ lines.append(f"- {edge['source']} → {edge['target']} ({edge['type']})")
860
+
861
+ return "\n".join(lines)
862
+
720
863
 
721
864
  async def main():
722
865
  """Entry point do MCP Server"""