ocerebro 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ocerebro",
3
- "version": "0.3.0",
3
+ "version": "0.3.2",
4
4
  "description": "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)",
5
5
  "main": "bin/ocerebro.js",
6
6
  "bin": {
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ocerebro"
7
- version = "0.3.0"
7
+ version = "0.3.2"
8
8
  description = "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -37,6 +37,7 @@ dependencies = [
37
37
  "sentence-transformers>=2.2.0",
38
38
  "mcp>=1.0.0",
39
39
  "anthropic>=0.40.0",
40
+ "spacy>=3.5.0",
40
41
  ]
41
42
 
42
43
  [project.optional-dependencies]
@@ -227,8 +227,9 @@ class Promoter:
227
227
  content=content
228
228
  )
229
229
 
230
- # Extrai entidades do frontmatter e registra no grafo
230
+ # Extrai entidades do frontmatter E do conteúdo
231
231
  self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
232
+ self._extract_entities_from_content(draft_id, content)
232
233
 
233
234
  return PromotionResult(
234
235
  success=True,
@@ -307,8 +308,9 @@ class Promoter:
307
308
  content=body
308
309
  )
309
310
 
310
- # Extrai entidades do frontmatter e registra no grafo
311
+ # Extrai entidades do frontmatter E do conteúdo
311
312
  self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
313
+ self._extract_entities_from_content(draft_id, body)
312
314
 
313
315
  return PromotionResult(
314
316
  success=True,
@@ -477,3 +479,27 @@ class Promoter:
477
479
  frontmatter=frontmatter,
478
480
  project=project
479
481
  )
482
+
483
def _extract_entities_from_content(
    self,
    memory_id: str,
    content: str
) -> List[str]:
    """
    Run content-based entity extraction for a memory and register it in the graph.

    Delegates to the entities database's ``extract_from_content`` with
    spaCy enabled. Does nothing when no entities database is configured.

    Args:
        memory_id: Memory ID
        content: Text content of the memory

    Returns:
        Whatever ``extract_from_content`` returns (a list of entity IDs),
        or an empty list when ``self.entities_db`` is falsy
    """
    graph_db = self.entities_db
    if graph_db:
        return graph_db.extract_from_content(
            memory_id=memory_id,
            content=content,
            use_spacy=True
        )
    return []
@@ -50,6 +50,7 @@ class EntitiesDB:
50
50
  memory_id TEXT,
51
51
  entity_name TEXT,
52
52
  entity_type TEXT,
53
+ source TEXT DEFAULT 'content',
53
54
  confidence REAL DEFAULT 1.0,
54
55
  span_start INTEGER,
55
56
  span_end INTEGER,
@@ -75,6 +76,26 @@ class EntitiesDB:
75
76
  ON entities(memory_id)
76
77
  """)
77
78
 
79
+ conn.execute("""
80
+ CREATE INDEX IF NOT EXISTS idx_entities_source
81
+ ON entities(source)
82
+ """)
83
+
84
+ # Tabela de cache de hash (para evitar reprocessamento)
85
+ conn.execute("""
86
+ CREATE TABLE IF NOT EXISTS entity_cache (
87
+ memory_id TEXT PRIMARY KEY,
88
+ content_hash TEXT,
89
+ processed_at TEXT DEFAULT (datetime('now')),
90
+ entity_count INTEGER DEFAULT 0
91
+ )
92
+ """)
93
+
94
+ conn.execute("""
95
+ CREATE INDEX IF NOT EXISTS idx_entity_cache_hash
96
+ ON entity_cache(content_hash)
97
+ """)
98
+
78
99
  # Tabela de relacionamentos
79
100
  conn.execute("""
80
101
  CREATE TABLE IF NOT EXISTS entity_relationships (
@@ -113,7 +134,8 @@ class EntitiesDB:
113
134
  confidence: float = 1.0,
114
135
  span_start: int = 0,
115
136
  span_end: int = 0,
116
- context_snippet: str = ""
137
+ context_snippet: str = "",
138
+ source: str = "content"
117
139
  ) -> str:
118
140
  """
119
141
  Insere uma entidade.
@@ -126,6 +148,7 @@ class EntitiesDB:
126
148
  span_start: Posição inicial no texto
127
149
  span_end: Posição final no texto
128
150
  context_snippet: Contexto ao redor da entidade
151
+ source: Origem da entidade ('frontmatter' ou 'content')
129
152
 
130
153
  Returns:
131
154
  ID da entidade
@@ -135,13 +158,14 @@ class EntitiesDB:
135
158
  conn = self._connect()
136
159
  conn.execute("""
137
160
  INSERT OR REPLACE INTO entities
138
- (id, memory_id, entity_name, entity_type, confidence, span_start, span_end, context_snippet)
139
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
161
+ (id, memory_id, entity_name, entity_type, source, confidence, span_start, span_end, context_snippet)
162
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
140
163
  """, (
141
164
  entity_id,
142
165
  memory_id,
143
166
  entity_name,
144
167
  entity_type,
168
+ source,
145
169
  confidence,
146
170
  span_start,
147
171
  span_end,
@@ -192,7 +216,7 @@ class EntitiesDB:
192
216
 
193
217
  def delete_entities_by_memory(self, memory_id: str) -> int:
194
218
  """
195
- Remove entidades de uma memória.
219
+ Remove todas as entidades de uma memória.
196
220
 
197
221
  Args:
198
222
  memory_id: ID da memória
@@ -217,6 +241,27 @@ class EntitiesDB:
217
241
  conn.close()
218
242
  return deleted
219
243
 
244
def delete_entities_by_source(self, memory_id: str, source: str) -> int:
    """
    Remove the entities of one memory that came from a given source.

    Args:
        memory_id: Memory ID
        source: Origin of the entities ('frontmatter' or 'content')

    Returns:
        Number of entities removed
    """
    conn = self._connect()
    try:
        cursor = conn.execute(
            "DELETE FROM entities WHERE memory_id = ? AND source = ?",
            (memory_id, source)
        )
        deleted = cursor.rowcount
        conn.commit()
    finally:
        # Close even when the statement or commit fails, so the
        # connection handle is never leaked.
        conn.close()
    return deleted
264
+
220
265
  # ========================================================================
221
266
  # OPERAÇÕES DE RELACIONAMENTOS
222
267
  # ========================================================================
@@ -506,7 +551,8 @@ class EntitiesDB:
506
551
  memory_id,
507
552
  f"TYPE:{frontmatter['type']}",
508
553
  "META",
509
- confidence=1.0
554
+ confidence=1.0,
555
+ source="frontmatter"
510
556
  )
511
557
  entity_ids.append(eid)
512
558
 
@@ -516,7 +562,8 @@ class EntitiesDB:
516
562
  memory_id,
517
563
  project,
518
564
  "PROJECT",
519
- confidence=1.0
565
+ confidence=1.0,
566
+ source="frontmatter"
520
567
  )
521
568
  entity_ids.append(eid)
522
569
 
@@ -529,7 +576,8 @@ class EntitiesDB:
529
576
  memory_id,
530
577
  f"TAG:{tag}",
531
578
  "TAG",
532
- confidence=1.0
579
+ confidence=1.0,
580
+ source="frontmatter"
533
581
  )
534
582
  entity_ids.append(eid)
535
583
  elif isinstance(tags, list):
@@ -538,12 +586,239 @@ class EntitiesDB:
538
586
  memory_id,
539
587
  f"TAG:{tag}",
540
588
  "TAG",
541
- confidence=1.0
589
+ confidence=1.0,
590
+ source="frontmatter"
542
591
  )
543
592
  entity_ids.append(eid)
544
593
 
545
594
  return entity_ids
546
595
 
596
def extract_from_content(
    self,
    memory_id: str,
    content: str,
    use_spacy: bool = True
) -> List[str]:
    """
    Extract entities from free text and store them with source "content".

    The content hash is compared with the cached one first; when it matches
    and the memory still has entities stored, nothing is reprocessed.
    Otherwise the memory's content-derived entities are deleted and
    extraction runs again, using spaCy NER when available and the heuristic
    fallback when spaCy is not importable or ``use_spacy`` is False.

    Extraction is best-effort: an unexpected spaCy failure is logged and the
    entities inserted so far are returned; no exception is propagated from
    the spaCy path.

    Args:
        memory_id: Memory ID
        content: Text content
        use_spacy: Use spaCy (default: True)

    Returns:
        List of IDs of the entities created (empty on a cache hit)
    """
    import logging

    # Cache check - only process when the content changed.
    cached_hash = self.get_cached_hash(memory_id)
    current_hash = self._compute_hash(content)

    if cached_hash == current_hash:
        # Same content; skip only if the memory still has entities stored.
        existing = self.get_entities_by_memory(memory_id)
        if existing:
            return []  # Already processed, nothing changed

    # New or changed content - drop only content entities and reprocess.
    self.delete_entities_by_source(memory_id, "content")

    entity_ids: List[str] = []

    if use_spacy:
        try:
            import spacy

            # Load the model, downloading it on first use.
            try:
                nlp = spacy.load("pt_core_news_sm")
            except OSError:
                # Model not installed - try to install it with the interpreter
                # that is running this code ("python" on PATH may be another
                # environment, or missing altogether).
                import subprocess
                import sys
                subprocess.run(
                    [sys.executable, "-m", "spacy", "download", "pt_core_news_sm"],
                    capture_output=True
                )
                nlp = spacy.load("pt_core_news_sm")

            # Process the text (capped at 5000 chars for performance).
            doc = nlp(content[:5000])

            # Map spaCy labels to our entity types. "PER" is the person label
            # emitted by the Portuguese news models; the remaining keys cover
            # the OntoNotes-style label set.
            label_map = {
                "ORG": "ORG",
                "PERSON": "PERSON",
                "PER": "PERSON",
                "GPE": "LOC",
                "LOC": "LOC",
                "PRODUCT": "PRODUCT",
                "EVENT": "EVENT",
                "WORK_OF_ART": "PRODUCT",
                "LAW": "PRODUCT",
                "LANGUAGE": "TECH",
            }

            for ent in doc.ents:
                entity_type = label_map.get(ent.label_)
                if entity_type is None:
                    continue

                # Optional custom extension; getattr with a default covers
                # both "extension not registered" and "registered but unset".
                score = getattr(ent._, "nerd_score", None)
                confidence = float(score) if score is not None else 0.8

                eid = self.insert_entity(
                    memory_id,
                    ent.text,
                    entity_type,
                    confidence=confidence,
                    span_start=ent.start_char,
                    span_end=ent.end_char,
                    context_snippet=content[max(0, ent.start_char - 25):ent.end_char + 25][:50],
                    source="content"
                )
                entity_ids.append(eid)

            # Update the cache after processing.
            self.update_cache(memory_id, content, len(entity_ids))

        except ImportError:
            # spaCy not available - use the simple fallback.
            entity_ids.extend(self._extract_entities_fallback(memory_id, content))
            self.update_cache(memory_id, content, len(entity_ids))
        except Exception:
            # Best-effort: never break the caller's flow, but leave a trace
            # instead of failing silently. The cache is intentionally not
            # updated so the next call retries.
            logging.getLogger(__name__).warning(
                "Entity extraction failed for memory %s", memory_id, exc_info=True
            )
    else:
        entity_ids.extend(self._extract_entities_fallback(memory_id, content))
        self.update_cache(memory_id, content, len(entity_ids))

    return entity_ids
694
+
695
def _extract_entities_fallback(
    self,
    memory_id: str,
    content: str
) -> List[str]:
    """
    Fallback without spaCy - uses simple heuristics.

    Scans the first 2000 characters for capitalized words of three or more
    letters, drops a small stop list of common English/Portuguese words,
    and inserts up to 20 distinct words (in order of first appearance) as
    low-confidence "ORG" entities.

    Args:
        memory_id: Memory ID
        content: Text content

    Returns:
        List of IDs of the entities created
    """
    # Letter-only words of 3+ characters. The class [^\W\d_] is Unicode-aware,
    # so accented names ("José", "São") are matched instead of being dropped.
    words = re.findall(r'\b[^\W\d_]{3,}\b', content[:2000])

    # Common sentence-initial words that are not entities.
    common_words = {
        "The", "This", "That", "These", "Those", "What", "When", "Where", "Why", "How",
        "Para", "Com", "Por", "Em", "De", "Do", "Da", "Dos", "Das", "Uma", "Um",
        "Como", "Quando", "Onde", "Qual", "Quais", "Quem", "Sobre"
    }

    # dict.fromkeys de-duplicates while keeping first-seen order, so the cap
    # below always keeps the same entities for the same text (slicing a set
    # would pick an arbitrary, run-dependent subset).
    candidates = dict.fromkeys(
        word for word in words
        if word[0].isupper() and word not in common_words
    )

    entity_ids = []
    for entity in list(candidates)[:20]:  # Cap at 20 entities
        eid = self.insert_entity(
            memory_id,
            entity,
            # Every candidate starts with an uppercase letter; the heuristic
            # cannot tell people from organizations, so all are typed "ORG".
            "ORG",
            confidence=0.5,  # Low confidence without spaCy
            context_snippet=entity
        )
        entity_ids.append(eid)

    return entity_ids
735
+
736
+ # ========================================================================
737
+ # CACHE DE HASH (para evitar reprocessamento)
738
+ # ========================================================================
739
+
740
def _compute_hash(self, content: str) -> str:
    """
    Fingerprint a piece of text for the reprocessing cache.

    Args:
        content: Text content

    Returns:
        First 16 hex characters of the SHA-256 digest of the UTF-8 encoded text
    """
    import hashlib

    hasher = hashlib.sha256()
    hasher.update(content.encode('utf-8'))
    full_digest = hasher.hexdigest()
    return full_digest[:16]
752
+
753
def get_cached_hash(self, memory_id: str) -> Optional[str]:
    """
    Fetch the cached content hash of a memory.

    Args:
        memory_id: Memory ID

    Returns:
        Stored hash, or None when the memory has no cache row
    """
    conn = self._connect()
    try:
        cursor = conn.execute(
            "SELECT content_hash FROM entity_cache WHERE memory_id = ?",
            (memory_id,)
        )
        row = cursor.fetchone()
    finally:
        # Close even when the query fails, so the connection is not leaked.
        conn.close()
    # Name-based access relies on the row factory configured by _connect().
    return row["content_hash"] if row else None
771
+
772
def is_content_changed(self, memory_id: str, content: str) -> bool:
    """
    Tell whether the content differs from what was last processed.

    Args:
        memory_id: Memory ID
        content: Text content

    Returns:
        True when the cached hash differs from the hash of ``content``
        (including when nothing is cached), False when they match
    """
    fresh_hash = self._compute_hash(content)
    stored_hash = self.get_cached_hash(memory_id)
    if stored_hash == fresh_hash:
        return False
    return True
786
+
787
def update_cache(self, memory_id: str, content: str, entity_count: int) -> None:
    """
    Record the content hash of a memory after it has been processed.

    Inserts or replaces the memory's row in ``entity_cache`` with the hash
    of ``content``, the current timestamp and the entity count.

    Args:
        memory_id: Memory ID
        content: Text content
        entity_count: Number of entities extracted
    """
    content_hash = self._compute_hash(content)

    conn = self._connect()
    try:
        conn.execute("""
            INSERT OR REPLACE INTO entity_cache
            (memory_id, content_hash, processed_at, entity_count)
            VALUES (?, ?, datetime('now'), ?)
        """, (memory_id, content_hash, entity_count))
        conn.commit()
    finally:
        # Close even when the write fails, so the connection is not leaked.
        conn.close()
806
+
807
def clear_cache(self, memory_id: str) -> None:
    """
    Remove the cache row of a memory.

    Args:
        memory_id: Memory ID
    """
    conn = self._connect()
    try:
        conn.execute(
            "DELETE FROM entity_cache WHERE memory_id = ?",
            (memory_id,)
        )
        conn.commit()
    finally:
        # Close even when the delete fails, so the connection is not leaked.
        conn.close()
821
+
547
822
  # ========================================================================
548
823
  # ESTATÍSTICAS
549
824
  # ========================================================================
@@ -87,6 +87,7 @@ class QueryEngine:
87
87
  if use_fts:
88
88
  fts_results = self._search_fts(query, project, mem_type, limit)
89
89
  for r in fts_results:
90
+ r.score *= fts_weight # Aplica peso FTS desde o início
90
91
  results[r.memory_id] = r
91
92
 
92
93
  # Busca Semantic
@@ -94,12 +95,9 @@ class QueryEngine:
94
95
  semantic_results = self._search_semantic(query, project, limit)
95
96
  for r in semantic_results:
96
97
  if r.memory_id in results:
97
- # Combina scores
98
+ # Combina scores: média ponderada (FTS já tem peso aplicado)
98
99
  existing = results[r.memory_id]
99
- combined_score = (
100
- existing.score * fts_weight +
101
- r.score * semantic_weight
102
- )
100
+ combined_score = existing.score + (r.score * semantic_weight)
103
101
  results[r.memory_id] = QueryResult(
104
102
  memory_id=r.memory_id,
105
103
  type=r.type,
@@ -118,12 +116,9 @@ class QueryEngine:
118
116
  graph_results = self._search_by_graph(query, limit)
119
117
  for r in graph_results:
120
118
  if r.memory_id in results:
121
- # Combina scores
119
+ # Combina scores: soma ponderada (scores anteriores já têm peso)
122
120
  existing = results[r.memory_id]
123
- combined_score = (
124
- existing.score * (1 - graph_weight) +
125
- r.score * graph_weight
126
- )
121
+ combined_score = existing.score + (r.score * graph_weight)
127
122
  results[r.memory_id] = QueryResult(
128
123
  memory_id=r.memory_id,
129
124
  type=r.type,
package/src/mcp/server.py CHANGED
@@ -532,7 +532,8 @@ class CerebroMCP:
532
532
  "",
533
533
  "Índice:",
534
534
  f" Metadata DB: {self.cerebro_path / 'index' / 'metadata.db'}",
535
- f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}"
535
+ f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}",
536
+ f" Entities DB: {self.cerebro_path / 'index' / 'entities.db'}"
536
537
  ]
537
538
 
538
539
  return "\n".join(lines)
@@ -712,8 +713,14 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
712
713
 
713
714
  desc_match = re.search(r'description:\s*(.*)', content)
714
715
  type_match = re.search(r'type:\s*(.*)', content)
716
+ project_match = re.search(r'project:\s*(.*)', content)
717
+ tags_match = re.search(r'tags:\s*(.*)', content)
718
+
715
719
  desc = desc_match.group(1).strip() if desc_match else "sem descrição"
716
720
  m_type = type_match.group(1).strip() if type_match else "project"
721
+ project = project_match.group(1).strip() if project_match else "unknown"
722
+ tags = tags_match.group(1).strip() if tags_match else ""
723
+
717
724
  ts = datetime.now().strftime("%Y-%m-%d")
718
725
  entry = f"- [{m_type}] {mem_name}.md ({ts}): {desc}\n"
719
726
 
@@ -726,6 +733,31 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
726
733
  else:
727
734
  index_path.write_text(f"# Memórias do Projeto\n\n{entry}", encoding="utf-8")
728
735
 
736
+ # BUG FIX: Registrar entidades no grafo (frontmatter + conteúdo)
737
+ # ORDEM IMPORTANTE: content primeiro, frontmatter depois
738
+ # extract_from_content() deleta entidades existentes, então frontmatter deve vir após
739
+ if self.entities_db:
740
+ import yaml
741
+ frontmatter_match = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
742
+ if frontmatter_match:
743
+ try:
744
+ frontmatter = yaml.safe_load(frontmatter_match.group(1))
745
+ body_content = frontmatter_match.group(2)
746
+ # 1. Extrai entidades do conteúdo (spaCy NER) - pode deletar existentes
747
+ self.entities_db.extract_from_content(
748
+ memory_id=mem_name,
749
+ content=body_content,
750
+ use_spacy=True
751
+ )
752
+ # 2. Extrai entidades do frontmatter - NÃO são deletadas
753
+ self.entities_db.extract_from_frontmatter(
754
+ memory_id=mem_name,
755
+ frontmatter=frontmatter or {},
756
+ project=project
757
+ )
758
+ except Exception as e:
759
+ pass # Falha silenciosa se frontmatter inválido
760
+
729
761
  return f"✅ Memória '{mem_name}' salva em {file_path}"
730
762
 
731
763
  def _remember(self, args: Dict[str, Any]) -> str: