ocerebro 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cerebro/index/embeddings.db +0 -0
- package/cerebro/index/entities.db +0 -0
- package/cerebro/index/metadata.db +0 -0
- package/package.json +1 -1
- package/pyproject.toml +2 -1
- package/src/consolidation/promoter.py +28 -2
- package/src/index/entities_db.py +241 -0
- package/src/index/queries.py +5 -10
- package/src/mcp/server.py +31 -1
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/package.json
CHANGED
package/pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ocerebro"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.3.1"
|
|
8
8
|
description = "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -37,6 +37,7 @@ dependencies = [
|
|
|
37
37
|
"sentence-transformers>=2.2.0",
|
|
38
38
|
"mcp>=1.0.0",
|
|
39
39
|
"anthropic>=0.40.0",
|
|
40
|
+
"spacy>=3.5.0",
|
|
40
41
|
]
|
|
41
42
|
|
|
42
43
|
[project.optional-dependencies]
|
package/src/consolidation/promoter.py
CHANGED
|
@@ -227,8 +227,9 @@ class Promoter:
|
|
|
227
227
|
content=content
|
|
228
228
|
)
|
|
229
229
|
|
|
230
|
-
# Extrai entidades do frontmatter
|
|
230
|
+
# Extrai entidades do frontmatter E do conteúdo
|
|
231
231
|
self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
|
|
232
|
+
self._extract_entities_from_content(draft_id, content)
|
|
232
233
|
|
|
233
234
|
return PromotionResult(
|
|
234
235
|
success=True,
|
|
@@ -307,8 +308,9 @@ class Promoter:
|
|
|
307
308
|
content=body
|
|
308
309
|
)
|
|
309
310
|
|
|
310
|
-
# Extrai entidades do frontmatter
|
|
311
|
+
# Extrai entidades do frontmatter E do conteúdo
|
|
311
312
|
self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
|
|
313
|
+
self._extract_entities_from_content(draft_id, body)
|
|
312
314
|
|
|
313
315
|
return PromotionResult(
|
|
314
316
|
success=True,
|
|
@@ -477,3 +479,27 @@ class Promoter:
|
|
|
477
479
|
frontmatter=frontmatter,
|
|
478
480
|
project=project
|
|
479
481
|
)
|
|
482
|
+
|
|
483
|
+
def _extract_entities_from_content(
|
|
484
|
+
self,
|
|
485
|
+
memory_id: str,
|
|
486
|
+
content: str
|
|
487
|
+
) -> List[str]:
|
|
488
|
+
"""
|
|
489
|
+
Extrai entidades do conteúdo usando spaCy NER e registra no grafo.
|
|
490
|
+
|
|
491
|
+
Args:
|
|
492
|
+
memory_id: ID da memória
|
|
493
|
+
content: Conteúdo de texto
|
|
494
|
+
|
|
495
|
+
Returns:
|
|
496
|
+
Lista de IDs de entidades criadas
|
|
497
|
+
"""
|
|
498
|
+
if not self.entities_db:
|
|
499
|
+
return []
|
|
500
|
+
|
|
501
|
+
return self.entities_db.extract_from_content(
|
|
502
|
+
memory_id=memory_id,
|
|
503
|
+
content=content,
|
|
504
|
+
use_spacy=True
|
|
505
|
+
)
|
package/src/index/entities_db.py
CHANGED
|
@@ -75,6 +75,21 @@ class EntitiesDB:
|
|
|
75
75
|
ON entities(memory_id)
|
|
76
76
|
""")
|
|
77
77
|
|
|
78
|
+
# Tabela de cache de hash (para evitar reprocessamento)
|
|
79
|
+
conn.execute("""
|
|
80
|
+
CREATE TABLE IF NOT EXISTS entity_cache (
|
|
81
|
+
memory_id TEXT PRIMARY KEY,
|
|
82
|
+
content_hash TEXT,
|
|
83
|
+
processed_at TEXT DEFAULT (datetime('now')),
|
|
84
|
+
entity_count INTEGER DEFAULT 0
|
|
85
|
+
)
|
|
86
|
+
""")
|
|
87
|
+
|
|
88
|
+
conn.execute("""
|
|
89
|
+
CREATE INDEX IF NOT EXISTS idx_entity_cache_hash
|
|
90
|
+
ON entity_cache(content_hash)
|
|
91
|
+
""")
|
|
92
|
+
|
|
78
93
|
# Tabela de relacionamentos
|
|
79
94
|
conn.execute("""
|
|
80
95
|
CREATE TABLE IF NOT EXISTS entity_relationships (
|
|
@@ -544,6 +559,232 @@ class EntitiesDB:
|
|
|
544
559
|
|
|
545
560
|
return entity_ids
|
|
546
561
|
|
|
562
|
+
def extract_from_content(
|
|
563
|
+
self,
|
|
564
|
+
memory_id: str,
|
|
565
|
+
content: str,
|
|
566
|
+
use_spacy: bool = True
|
|
567
|
+
) -> List[str]:
|
|
568
|
+
"""
|
|
569
|
+
Extrai entidades do conteúdo usando spaCy NER.
|
|
570
|
+
|
|
571
|
+
Args:
|
|
572
|
+
memory_id: ID da memória
|
|
573
|
+
content: Conteúdo de texto
|
|
574
|
+
use_spacy: Usar spaCy (padrão: True)
|
|
575
|
+
|
|
576
|
+
Returns:
|
|
577
|
+
Lista de IDs de entidades criadas
|
|
578
|
+
"""
|
|
579
|
+
# Verifica cache - só processa se conteúdo mudou
|
|
580
|
+
cached_hash = self.get_cached_hash(memory_id)
|
|
581
|
+
current_hash = self._compute_hash(content)
|
|
582
|
+
|
|
583
|
+
if cached_hash == current_hash:
|
|
584
|
+
# Conteúdo igual, verifica se já tem entidades
|
|
585
|
+
existing = self.get_entities_by_memory(memory_id)
|
|
586
|
+
if existing:
|
|
587
|
+
return [] # Já processado, sem mudanças
|
|
588
|
+
|
|
589
|
+
# Conteúdo novo ou mudou - remove entidades antigas e reprocessa
|
|
590
|
+
self.delete_entities_by_memory(memory_id)
|
|
591
|
+
|
|
592
|
+
entity_ids = []
|
|
593
|
+
|
|
594
|
+
if use_spacy:
|
|
595
|
+
try:
|
|
596
|
+
import spacy
|
|
597
|
+
from spacy import Language
|
|
598
|
+
|
|
599
|
+
# Carrega modelo (download na primeira vez)
|
|
600
|
+
try:
|
|
601
|
+
nlp: Language = spacy.load("pt_core_news_sm")
|
|
602
|
+
except OSError:
|
|
603
|
+
# Modelo não instalado - tenta instalar
|
|
604
|
+
import subprocess
|
|
605
|
+
subprocess.run(
|
|
606
|
+
["python", "-m", "spacy", "download", "pt_core_news_sm"],
|
|
607
|
+
capture_output=True
|
|
608
|
+
)
|
|
609
|
+
nlp = spacy.load("pt_core_news_sm")
|
|
610
|
+
|
|
611
|
+
# Processa texto
|
|
612
|
+
doc = nlp(content[:5000]) # Limita a 5000 chars para performance
|
|
613
|
+
|
|
614
|
+
# Mapeia labels do spaCy para nossos tipos
|
|
615
|
+
label_map = {
|
|
616
|
+
"ORG": "ORG",
|
|
617
|
+
"PERSON": "PERSON",
|
|
618
|
+
"GPE": "LOC",
|
|
619
|
+
"LOC": "LOC",
|
|
620
|
+
"PRODUCT": "PRODUCT",
|
|
621
|
+
"EVENT": "EVENT",
|
|
622
|
+
"WORK_OF_ART": "PRODUCT",
|
|
623
|
+
"LAW": "PRODUCT",
|
|
624
|
+
"LANGUAGE": "TECH",
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
for ent in doc.ents:
|
|
628
|
+
if ent.label_ in label_map:
|
|
629
|
+
entity_type = label_map[ent.label_]
|
|
630
|
+
confidence = float(ent._.get("nerd_score", 0.8)) if hasattr(ent._, "nerd_score") else 0.8
|
|
631
|
+
|
|
632
|
+
eid = self.insert_entity(
|
|
633
|
+
memory_id,
|
|
634
|
+
ent.text,
|
|
635
|
+
entity_type,
|
|
636
|
+
confidence=confidence,
|
|
637
|
+
span_start=ent.start_char,
|
|
638
|
+
span_end=ent.end_char,
|
|
639
|
+
context_snippet=content[max(0, ent.start_char - 25):ent.end_char + 25][:50]
|
|
640
|
+
)
|
|
641
|
+
entity_ids.append(eid)
|
|
642
|
+
|
|
643
|
+
# Atualiza cache após processamento
|
|
644
|
+
self.update_cache(memory_id, content, len(entity_ids))
|
|
645
|
+
|
|
646
|
+
except ImportError:
|
|
647
|
+
# spaCy não disponível - usa fallback simples
|
|
648
|
+
entity_ids.extend(self._extract_entities_fallback(memory_id, content))
|
|
649
|
+
# Atualiza cache após processamento
|
|
650
|
+
self.update_cache(memory_id, content, len(entity_ids))
|
|
651
|
+
except Exception:
|
|
652
|
+
# Falha silenciosa - não quebra o fluxo
|
|
653
|
+
pass
|
|
654
|
+
else:
|
|
655
|
+
entity_ids.extend(self._extract_entities_fallback(memory_id, content))
|
|
656
|
+
# Atualiza cache após processamento
|
|
657
|
+
self.update_cache(memory_id, content, len(entity_ids))
|
|
658
|
+
|
|
659
|
+
return entity_ids
|
|
660
|
+
|
|
661
|
+
def _extract_entities_fallback(
|
|
662
|
+
self,
|
|
663
|
+
memory_id: str,
|
|
664
|
+
content: str
|
|
665
|
+
) -> List[str]:
|
|
666
|
+
"""
|
|
667
|
+
Fallback sem spaCy - usa heurísticas simples.
|
|
668
|
+
|
|
669
|
+
Args:
|
|
670
|
+
memory_id: ID da memória
|
|
671
|
+
content: Conteúdo de texto
|
|
672
|
+
|
|
673
|
+
Returns:
|
|
674
|
+
Lista de IDs de entidades criadas
|
|
675
|
+
"""
|
|
676
|
+
entity_ids = []
|
|
677
|
+
|
|
678
|
+
# Palavras capitalizadas (prováveis nomes próprios/ORGs)
|
|
679
|
+
capitalized = re.findall(r'\b[A-Z][a-zA-Z]{2,}\b', content[:2000])
|
|
680
|
+
|
|
681
|
+
# Filtra termos comuns que não são entidades
|
|
682
|
+
common_words = {
|
|
683
|
+
"The", "This", "That", "These", "Those", "What", "When", "Where", "Why", "How",
|
|
684
|
+
"Para", "Com", "Por", "Em", "De", "Do", "Da", "Dos", "Das", "Uma", "Um",
|
|
685
|
+
"Como", "Quando", "Onde", "Qual", "Quais", "Quem", "Sobre"
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
entities = set(e for e in capitalized if e not in common_words)
|
|
689
|
+
|
|
690
|
+
for entity in list(entities)[:20]: # Limita a 20 entidades
|
|
691
|
+
eid = self.insert_entity(
|
|
692
|
+
memory_id,
|
|
693
|
+
entity,
|
|
694
|
+
"ORG" if entity[0].isupper() else "PERSON",
|
|
695
|
+
confidence=0.5, # Baixa confiança sem spaCy
|
|
696
|
+
context_snippet=entity
|
|
697
|
+
)
|
|
698
|
+
entity_ids.append(eid)
|
|
699
|
+
|
|
700
|
+
return entity_ids
|
|
701
|
+
|
|
702
|
+
# ========================================================================
|
|
703
|
+
# CACHE DE HASH (para evitar reprocessamento)
|
|
704
|
+
# ========================================================================
|
|
705
|
+
|
|
706
|
+
def _compute_hash(self, content: str) -> str:
|
|
707
|
+
"""
|
|
708
|
+
Computa hash do conteúdo.
|
|
709
|
+
|
|
710
|
+
Args:
|
|
711
|
+
content: Conteúdo de texto
|
|
712
|
+
|
|
713
|
+
Returns:
|
|
714
|
+
Hash SHA256 (primeiros 16 chars)
|
|
715
|
+
"""
|
|
716
|
+
import hashlib
|
|
717
|
+
return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]
|
|
718
|
+
|
|
719
|
+
def get_cached_hash(self, memory_id: str) -> Optional[str]:
|
|
720
|
+
"""
|
|
721
|
+
Obtém hash em cache de uma memória.
|
|
722
|
+
|
|
723
|
+
Args:
|
|
724
|
+
memory_id: ID da memória
|
|
725
|
+
|
|
726
|
+
Returns:
|
|
727
|
+
Hash armazenado ou None
|
|
728
|
+
"""
|
|
729
|
+
conn = self._connect()
|
|
730
|
+
cursor = conn.execute(
|
|
731
|
+
"SELECT content_hash FROM entity_cache WHERE memory_id = ?",
|
|
732
|
+
(memory_id,)
|
|
733
|
+
)
|
|
734
|
+
row = cursor.fetchone()
|
|
735
|
+
conn.close()
|
|
736
|
+
return row["content_hash"] if row else None
|
|
737
|
+
|
|
738
|
+
def is_content_changed(self, memory_id: str, content: str) -> bool:
|
|
739
|
+
"""
|
|
740
|
+
Verifica se conteúdo mudou desde último processamento.
|
|
741
|
+
|
|
742
|
+
Args:
|
|
743
|
+
memory_id: ID da memória
|
|
744
|
+
content: Conteúdo de texto
|
|
745
|
+
|
|
746
|
+
Returns:
|
|
747
|
+
True se mudou, False se igual
|
|
748
|
+
"""
|
|
749
|
+
current_hash = self._compute_hash(content)
|
|
750
|
+
cached_hash = self.get_cached_hash(memory_id)
|
|
751
|
+
return cached_hash != current_hash
|
|
752
|
+
|
|
753
|
+
def update_cache(self, memory_id: str, content: str, entity_count: int) -> None:
|
|
754
|
+
"""
|
|
755
|
+
Atualiza cache de hash após processamento.
|
|
756
|
+
|
|
757
|
+
Args:
|
|
758
|
+
memory_id: ID da memória
|
|
759
|
+
content: Conteúdo de texto
|
|
760
|
+
entity_count: Número de entidades extraídas
|
|
761
|
+
"""
|
|
762
|
+
content_hash = self._compute_hash(content)
|
|
763
|
+
|
|
764
|
+
conn = self._connect()
|
|
765
|
+
conn.execute("""
|
|
766
|
+
INSERT OR REPLACE INTO entity_cache
|
|
767
|
+
(memory_id, content_hash, processed_at, entity_count)
|
|
768
|
+
VALUES (?, ?, datetime('now'), ?)
|
|
769
|
+
""", (memory_id, content_hash, entity_count))
|
|
770
|
+
conn.commit()
|
|
771
|
+
conn.close()
|
|
772
|
+
|
|
773
|
+
def clear_cache(self, memory_id: str) -> None:
|
|
774
|
+
"""
|
|
775
|
+
Limpa cache de uma memória.
|
|
776
|
+
|
|
777
|
+
Args:
|
|
778
|
+
memory_id: ID da memória
|
|
779
|
+
"""
|
|
780
|
+
conn = self._connect()
|
|
781
|
+
conn.execute(
|
|
782
|
+
"DELETE FROM entity_cache WHERE memory_id = ?",
|
|
783
|
+
(memory_id,)
|
|
784
|
+
)
|
|
785
|
+
conn.commit()
|
|
786
|
+
conn.close()
|
|
787
|
+
|
|
547
788
|
# ========================================================================
|
|
548
789
|
# ESTATÍSTICAS
|
|
549
790
|
# ========================================================================
|
package/src/index/queries.py
CHANGED
|
@@ -87,6 +87,7 @@ class QueryEngine:
|
|
|
87
87
|
if use_fts:
|
|
88
88
|
fts_results = self._search_fts(query, project, mem_type, limit)
|
|
89
89
|
for r in fts_results:
|
|
90
|
+
r.score *= fts_weight # Aplica peso FTS desde o início
|
|
90
91
|
results[r.memory_id] = r
|
|
91
92
|
|
|
92
93
|
# Busca Semantic
|
|
@@ -94,12 +95,9 @@ class QueryEngine:
|
|
|
94
95
|
semantic_results = self._search_semantic(query, project, limit)
|
|
95
96
|
for r in semantic_results:
|
|
96
97
|
if r.memory_id in results:
|
|
97
|
-
# Combina scores
|
|
98
|
+
# Combina scores: média ponderada (FTS já tem peso aplicado)
|
|
98
99
|
existing = results[r.memory_id]
|
|
99
|
-
combined_score = (
|
|
100
|
-
existing.score * fts_weight +
|
|
101
|
-
r.score * semantic_weight
|
|
102
|
-
)
|
|
100
|
+
combined_score = existing.score + (r.score * semantic_weight)
|
|
103
101
|
results[r.memory_id] = QueryResult(
|
|
104
102
|
memory_id=r.memory_id,
|
|
105
103
|
type=r.type,
|
|
@@ -118,12 +116,9 @@ class QueryEngine:
|
|
|
118
116
|
graph_results = self._search_by_graph(query, limit)
|
|
119
117
|
for r in graph_results:
|
|
120
118
|
if r.memory_id in results:
|
|
121
|
-
# Combina scores
|
|
119
|
+
# Combina scores: soma ponderada (scores anteriores já têm peso)
|
|
122
120
|
existing = results[r.memory_id]
|
|
123
|
-
combined_score = (
|
|
124
|
-
existing.score * (1 - graph_weight) +
|
|
125
|
-
r.score * graph_weight
|
|
126
|
-
)
|
|
121
|
+
combined_score = existing.score + (r.score * graph_weight)
|
|
127
122
|
results[r.memory_id] = QueryResult(
|
|
128
123
|
memory_id=r.memory_id,
|
|
129
124
|
type=r.type,
|
package/src/mcp/server.py
CHANGED
|
@@ -532,7 +532,8 @@ class CerebroMCP:
|
|
|
532
532
|
"",
|
|
533
533
|
"Índice:",
|
|
534
534
|
f" Metadata DB: {self.cerebro_path / 'index' / 'metadata.db'}",
|
|
535
|
-
f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}"
|
|
535
|
+
f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}",
|
|
536
|
+
f" Entities DB: {self.cerebro_path / 'index' / 'entities.db'}"
|
|
536
537
|
]
|
|
537
538
|
|
|
538
539
|
return "\n".join(lines)
|
|
@@ -712,8 +713,14 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
|
|
|
712
713
|
|
|
713
714
|
desc_match = re.search(r'description:\s*(.*)', content)
|
|
714
715
|
type_match = re.search(r'type:\s*(.*)', content)
|
|
716
|
+
project_match = re.search(r'project:\s*(.*)', content)
|
|
717
|
+
tags_match = re.search(r'tags:\s*(.*)', content)
|
|
718
|
+
|
|
715
719
|
desc = desc_match.group(1).strip() if desc_match else "sem descrição"
|
|
716
720
|
m_type = type_match.group(1).strip() if type_match else "project"
|
|
721
|
+
project = project_match.group(1).strip() if project_match else "unknown"
|
|
722
|
+
tags = tags_match.group(1).strip() if tags_match else ""
|
|
723
|
+
|
|
717
724
|
ts = datetime.now().strftime("%Y-%m-%d")
|
|
718
725
|
entry = f"- [{m_type}] {mem_name}.md ({ts}): {desc}\n"
|
|
719
726
|
|
|
@@ -726,6 +733,29 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
|
|
|
726
733
|
else:
|
|
727
734
|
index_path.write_text(f"# Memórias do Projeto\n\n{entry}", encoding="utf-8")
|
|
728
735
|
|
|
736
|
+
# BUG FIX: Registrar entidades no grafo (frontmatter + conteúdo)
|
|
737
|
+
if self.entities_db:
|
|
738
|
+
import yaml
|
|
739
|
+
frontmatter_match = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
|
|
740
|
+
if frontmatter_match:
|
|
741
|
+
try:
|
|
742
|
+
frontmatter = yaml.safe_load(frontmatter_match.group(1))
|
|
743
|
+
# Extrai entidades do frontmatter
|
|
744
|
+
self.entities_db.extract_from_frontmatter(
|
|
745
|
+
memory_id=mem_name,
|
|
746
|
+
frontmatter=frontmatter or {},
|
|
747
|
+
project=project
|
|
748
|
+
)
|
|
749
|
+
# Extrai entidades do conteúdo (spaCy NER)
|
|
750
|
+
body_content = frontmatter_match.group(2)
|
|
751
|
+
self.entities_db.extract_from_content(
|
|
752
|
+
memory_id=mem_name,
|
|
753
|
+
content=body_content,
|
|
754
|
+
use_spacy=True
|
|
755
|
+
)
|
|
756
|
+
except Exception as e:
|
|
757
|
+
pass # Falha silenciosa se frontmatter inválido
|
|
758
|
+
|
|
729
759
|
return f"✅ Memória '{mem_name}' salva em {file_path}"
|
|
730
760
|
|
|
731
761
|
def _remember(self, args: Dict[str, Any]) -> str:
|