ocerebro 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ocerebro",
3
- "version": "0.3.0",
3
+ "version": "0.3.1",
4
4
  "description": "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)",
5
5
  "main": "bin/ocerebro.js",
6
6
  "bin": {
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ocerebro"
7
- version = "0.3.0"
7
+ version = "0.3.1"
8
8
  description = "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -37,6 +37,7 @@ dependencies = [
37
37
  "sentence-transformers>=2.2.0",
38
38
  "mcp>=1.0.0",
39
39
  "anthropic>=0.40.0",
40
+ "spacy>=3.5.0",
40
41
  ]
41
42
 
42
43
  [project.optional-dependencies]
@@ -227,8 +227,9 @@ class Promoter:
227
227
  content=content
228
228
  )
229
229
 
230
- # Extrai entidades do frontmatter e registra no grafo
230
+ # Extrai entidades do frontmatter E do conteúdo
231
231
  self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
232
+ self._extract_entities_from_content(draft_id, content)
232
233
 
233
234
  return PromotionResult(
234
235
  success=True,
@@ -307,8 +308,9 @@ class Promoter:
307
308
  content=body
308
309
  )
309
310
 
310
- # Extrai entidades do frontmatter e registra no grafo
311
+ # Extrai entidades do frontmatter E do conteúdo
311
312
  self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
313
+ self._extract_entities_from_content(draft_id, body)
312
314
 
313
315
  return PromotionResult(
314
316
  success=True,
@@ -477,3 +479,27 @@ class Promoter:
477
479
  frontmatter=frontmatter,
478
480
  project=project
479
481
  )
482
+
483
+ def _extract_entities_from_content(
484
+ self,
485
+ memory_id: str,
486
+ content: str
487
+ ) -> List[str]:
488
+ """
489
+ Extrai entidades do conteúdo usando spaCy NER e registra no grafo.
490
+
491
+ Args:
492
+ memory_id: ID da memória
493
+ content: Conteúdo de texto
494
+
495
+ Returns:
496
+ Lista de IDs de entidades criadas
497
+ """
498
+ if not self.entities_db:
499
+ return []
500
+
501
+ return self.entities_db.extract_from_content(
502
+ memory_id=memory_id,
503
+ content=content,
504
+ use_spacy=True
505
+ )
@@ -75,6 +75,21 @@ class EntitiesDB:
75
75
  ON entities(memory_id)
76
76
  """)
77
77
 
78
+ # Tabela de cache de hash (para evitar reprocessamento)
79
+ conn.execute("""
80
+ CREATE TABLE IF NOT EXISTS entity_cache (
81
+ memory_id TEXT PRIMARY KEY,
82
+ content_hash TEXT,
83
+ processed_at TEXT DEFAULT (datetime('now')),
84
+ entity_count INTEGER DEFAULT 0
85
+ )
86
+ """)
87
+
88
+ conn.execute("""
89
+ CREATE INDEX IF NOT EXISTS idx_entity_cache_hash
90
+ ON entity_cache(content_hash)
91
+ """)
92
+
78
93
  # Tabela de relacionamentos
79
94
  conn.execute("""
80
95
  CREATE TABLE IF NOT EXISTS entity_relationships (
@@ -544,6 +559,232 @@ class EntitiesDB:
544
559
 
545
560
  return entity_ids
546
561
 
562
+ def extract_from_content(
563
+ self,
564
+ memory_id: str,
565
+ content: str,
566
+ use_spacy: bool = True
567
+ ) -> List[str]:
568
+ """
569
+ Extrai entidades do conteúdo usando spaCy NER.
570
+
571
+ Args:
572
+ memory_id: ID da memória
573
+ content: Conteúdo de texto
574
+ use_spacy: Usar spaCy (padrão: True)
575
+
576
+ Returns:
577
+ Lista de IDs de entidades criadas
578
+ """
579
+ # Verifica cache - só processa se conteúdo mudou
580
+ cached_hash = self.get_cached_hash(memory_id)
581
+ current_hash = self._compute_hash(content)
582
+
583
+ if cached_hash == current_hash:
584
+ # Conteúdo igual, verifica se já tem entidades
585
+ existing = self.get_entities_by_memory(memory_id)
586
+ if existing:
587
+ return [] # Já processado, sem mudanças
588
+
589
+ # Conteúdo novo ou mudou - remove entidades antigas e reprocessa
590
+ self.delete_entities_by_memory(memory_id)
591
+
592
+ entity_ids = []
593
+
594
+ if use_spacy:
595
+ try:
596
+ import spacy
597
+ from spacy import Language
598
+
599
+ # Carrega modelo (download na primeira vez)
600
+ try:
601
+ nlp: Language = spacy.load("pt_core_news_sm")
602
+ except OSError:
603
+ # Modelo não instalado - tenta instalar
604
+ import subprocess
605
+ subprocess.run(
606
+ ["python", "-m", "spacy", "download", "pt_core_news_sm"],
607
+ capture_output=True
608
+ )
609
+ nlp = spacy.load("pt_core_news_sm")
610
+
611
+ # Processa texto
612
+ doc = nlp(content[:5000]) # Limita a 5000 chars para performance
613
+
614
+ # Mapeia labels do spaCy para nossos tipos
615
+ label_map = {
616
+ "ORG": "ORG",
617
+ "PERSON": "PERSON",
618
+ "GPE": "LOC",
619
+ "LOC": "LOC",
620
+ "PRODUCT": "PRODUCT",
621
+ "EVENT": "EVENT",
622
+ "WORK_OF_ART": "PRODUCT",
623
+ "LAW": "PRODUCT",
624
+ "LANGUAGE": "TECH",
625
+ }
626
+
627
+ for ent in doc.ents:
628
+ if ent.label_ in label_map:
629
+ entity_type = label_map[ent.label_]
630
+ confidence = float(ent._.get("nerd_score", 0.8)) if hasattr(ent._, "nerd_score") else 0.8
631
+
632
+ eid = self.insert_entity(
633
+ memory_id,
634
+ ent.text,
635
+ entity_type,
636
+ confidence=confidence,
637
+ span_start=ent.start_char,
638
+ span_end=ent.end_char,
639
+ context_snippet=content[max(0, ent.start_char - 25):ent.end_char + 25][:50]
640
+ )
641
+ entity_ids.append(eid)
642
+
643
+ # Atualiza cache após processamento
644
+ self.update_cache(memory_id, content, len(entity_ids))
645
+
646
+ except ImportError:
647
+ # spaCy não disponível - usa fallback simples
648
+ entity_ids.extend(self._extract_entities_fallback(memory_id, content))
649
+ # Atualiza cache após processamento
650
+ self.update_cache(memory_id, content, len(entity_ids))
651
+ except Exception:
652
+ # Falha silenciosa - não quebra o fluxo
653
+ pass
654
+ else:
655
+ entity_ids.extend(self._extract_entities_fallback(memory_id, content))
656
+ # Atualiza cache após processamento
657
+ self.update_cache(memory_id, content, len(entity_ids))
658
+
659
+ return entity_ids
660
+
661
+ def _extract_entities_fallback(
662
+ self,
663
+ memory_id: str,
664
+ content: str
665
+ ) -> List[str]:
666
+ """
667
+ Fallback sem spaCy - usa heurísticas simples.
668
+
669
+ Args:
670
+ memory_id: ID da memória
671
+ content: Conteúdo de texto
672
+
673
+ Returns:
674
+ Lista de IDs de entidades criadas
675
+ """
676
+ entity_ids = []
677
+
678
+ # Palavras capitalizadas (prováveis nomes próprios/ORGs)
679
+ capitalized = re.findall(r'\b[A-Z][a-zA-Z]{2,}\b', content[:2000])
680
+
681
+ # Filtra termos comuns que não são entidades
682
+ common_words = {
683
+ "The", "This", "That", "These", "Those", "What", "When", "Where", "Why", "How",
684
+ "Para", "Com", "Por", "Em", "De", "Do", "Da", "Dos", "Das", "Uma", "Um",
685
+ "Como", "Quando", "Onde", "Qual", "Quais", "Quem", "Sobre"
686
+ }
687
+
688
+ entities = set(e for e in capitalized if e not in common_words)
689
+
690
+ for entity in list(entities)[:20]: # Limita a 20 entidades
691
+ eid = self.insert_entity(
692
+ memory_id,
693
+ entity,
694
+ "ORG" if entity[0].isupper() else "PERSON",
695
+ confidence=0.5, # Baixa confiança sem spaCy
696
+ context_snippet=entity
697
+ )
698
+ entity_ids.append(eid)
699
+
700
+ return entity_ids
701
+
702
+ # ========================================================================
703
+ # CACHE DE HASH (para evitar reprocessamento)
704
+ # ========================================================================
705
+
706
+ def _compute_hash(self, content: str) -> str:
707
+ """
708
+ Computa hash do conteúdo.
709
+
710
+ Args:
711
+ content: Conteúdo de texto
712
+
713
+ Returns:
714
+ Hash SHA256 (primeiros 16 chars)
715
+ """
716
+ import hashlib
717
+ return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]
718
+
719
+ def get_cached_hash(self, memory_id: str) -> Optional[str]:
720
+ """
721
+ Obtém hash em cache de uma memória.
722
+
723
+ Args:
724
+ memory_id: ID da memória
725
+
726
+ Returns:
727
+ Hash armazenado ou None
728
+ """
729
+ conn = self._connect()
730
+ cursor = conn.execute(
731
+ "SELECT content_hash FROM entity_cache WHERE memory_id = ?",
732
+ (memory_id,)
733
+ )
734
+ row = cursor.fetchone()
735
+ conn.close()
736
+ return row["content_hash"] if row else None
737
+
738
+ def is_content_changed(self, memory_id: str, content: str) -> bool:
739
+ """
740
+ Verifica se conteúdo mudou desde último processamento.
741
+
742
+ Args:
743
+ memory_id: ID da memória
744
+ content: Conteúdo de texto
745
+
746
+ Returns:
747
+ True se mudou, False se igual
748
+ """
749
+ current_hash = self._compute_hash(content)
750
+ cached_hash = self.get_cached_hash(memory_id)
751
+ return cached_hash != current_hash
752
+
753
+ def update_cache(self, memory_id: str, content: str, entity_count: int) -> None:
754
+ """
755
+ Atualiza cache de hash após processamento.
756
+
757
+ Args:
758
+ memory_id: ID da memória
759
+ content: Conteúdo de texto
760
+ entity_count: Número de entidades extraídas
761
+ """
762
+ content_hash = self._compute_hash(content)
763
+
764
+ conn = self._connect()
765
+ conn.execute("""
766
+ INSERT OR REPLACE INTO entity_cache
767
+ (memory_id, content_hash, processed_at, entity_count)
768
+ VALUES (?, ?, datetime('now'), ?)
769
+ """, (memory_id, content_hash, entity_count))
770
+ conn.commit()
771
+ conn.close()
772
+
773
+ def clear_cache(self, memory_id: str) -> None:
774
+ """
775
+ Limpa cache de uma memória.
776
+
777
+ Args:
778
+ memory_id: ID da memória
779
+ """
780
+ conn = self._connect()
781
+ conn.execute(
782
+ "DELETE FROM entity_cache WHERE memory_id = ?",
783
+ (memory_id,)
784
+ )
785
+ conn.commit()
786
+ conn.close()
787
+
547
788
  # ========================================================================
548
789
  # ESTATÍSTICAS
549
790
  # ========================================================================
@@ -87,6 +87,7 @@ class QueryEngine:
87
87
  if use_fts:
88
88
  fts_results = self._search_fts(query, project, mem_type, limit)
89
89
  for r in fts_results:
90
+ r.score *= fts_weight # Aplica peso FTS desde o início
90
91
  results[r.memory_id] = r
91
92
 
92
93
  # Busca Semantic
@@ -94,12 +95,9 @@ class QueryEngine:
94
95
  semantic_results = self._search_semantic(query, project, limit)
95
96
  for r in semantic_results:
96
97
  if r.memory_id in results:
97
- # Combina scores
98
+ # Combina scores: média ponderada (FTS já tem peso aplicado)
98
99
  existing = results[r.memory_id]
99
- combined_score = (
100
- existing.score * fts_weight +
101
- r.score * semantic_weight
102
- )
100
+ combined_score = existing.score + (r.score * semantic_weight)
103
101
  results[r.memory_id] = QueryResult(
104
102
  memory_id=r.memory_id,
105
103
  type=r.type,
@@ -118,12 +116,9 @@ class QueryEngine:
118
116
  graph_results = self._search_by_graph(query, limit)
119
117
  for r in graph_results:
120
118
  if r.memory_id in results:
121
- # Combina scores
119
+ # Combina scores: soma ponderada (scores anteriores já têm peso)
122
120
  existing = results[r.memory_id]
123
- combined_score = (
124
- existing.score * (1 - graph_weight) +
125
- r.score * graph_weight
126
- )
121
+ combined_score = existing.score + (r.score * graph_weight)
127
122
  results[r.memory_id] = QueryResult(
128
123
  memory_id=r.memory_id,
129
124
  type=r.type,
package/src/mcp/server.py CHANGED
@@ -532,7 +532,8 @@ class CerebroMCP:
532
532
  "",
533
533
  "Índice:",
534
534
  f" Metadata DB: {self.cerebro_path / 'index' / 'metadata.db'}",
535
- f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}"
535
+ f" Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}",
536
+ f" Entities DB: {self.cerebro_path / 'index' / 'entities.db'}"
536
537
  ]
537
538
 
538
539
  return "\n".join(lines)
@@ -712,8 +713,14 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
712
713
 
713
714
  desc_match = re.search(r'description:\s*(.*)', content)
714
715
  type_match = re.search(r'type:\s*(.*)', content)
716
+ project_match = re.search(r'project:\s*(.*)', content)
717
+ tags_match = re.search(r'tags:\s*(.*)', content)
718
+
715
719
  desc = desc_match.group(1).strip() if desc_match else "sem descrição"
716
720
  m_type = type_match.group(1).strip() if type_match else "project"
721
+ project = project_match.group(1).strip() if project_match else "unknown"
722
+ tags = tags_match.group(1).strip() if tags_match else ""
723
+
717
724
  ts = datetime.now().strftime("%Y-%m-%d")
718
725
  entry = f"- [{m_type}] {mem_name}.md ({ts}): {desc}\n"
719
726
 
@@ -726,6 +733,29 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
726
733
  else:
727
734
  index_path.write_text(f"# Memórias do Projeto\n\n{entry}", encoding="utf-8")
728
735
 
736
+ # BUG FIX: Registrar entidades no grafo (frontmatter + conteúdo)
737
+ if self.entities_db:
738
+ import yaml
739
+ frontmatter_match = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
740
+ if frontmatter_match:
741
+ try:
742
+ frontmatter = yaml.safe_load(frontmatter_match.group(1))
743
+ # Extrai entidades do frontmatter
744
+ self.entities_db.extract_from_frontmatter(
745
+ memory_id=mem_name,
746
+ frontmatter=frontmatter or {},
747
+ project=project
748
+ )
749
+ # Extrai entidades do conteúdo (spaCy NER)
750
+ body_content = frontmatter_match.group(2)
751
+ self.entities_db.extract_from_content(
752
+ memory_id=mem_name,
753
+ content=body_content,
754
+ use_spacy=True
755
+ )
756
+ except Exception as e:
757
+ pass # Falha silenciosa se frontmatter inválido
758
+
729
759
  return f"✅ Memória '{mem_name}' salva em {file_path}"
730
760
 
731
761
  def _remember(self, args: Dict[str, Any]) -> str: