npm - ocerebro - Versions diffs - 0.3.0 → 0.3.1 - Mend

ocerebro 0.3.0 → 0.3.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.

Files changed (9)

package/cerebro/index/embeddings.db +0 -0
package/cerebro/index/entities.db +0 -0
package/cerebro/index/metadata.db +0 -0
package/package.json +1 -1
package/pyproject.toml +2 -1
package/src/consolidation/promoter.py +28 -2
package/src/index/entities_db.py +241 -0
package/src/index/queries.py +5 -10
package/src/mcp/server.py +31 -1

package/cerebro/index/embeddings.db ADDED Viewed

Binary file

package/cerebro/index/entities.db ADDED Viewed

Binary file

package/cerebro/index/metadata.db ADDED Viewed

Binary file

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "ocerebro",
-  "version": "0.3.0",
+  "version": "0.3.1",
   "description": "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)",
   "main": "bin/ocerebro.js",
   "bin": {

package/pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "ocerebro"
-version = "0.3.0"
+version = "0.3.1"
 description = "OCerebro - Sistema de Memoria para Agentes (Claude Code/MCP)"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -37,6 +37,7 @@ dependencies = [
     "sentence-transformers>=2.2.0",
     "mcp>=1.0.0",
     "anthropic>=0.40.0",
+    "spacy>=3.5.0",
 ]
 [project.optional-dependencies]

package/src/consolidation/promoter.py CHANGED Viewed

@@ -227,8 +227,9 @@ class Promoter:
             content=content
         )
-        # Extrai entidades do frontmatter e registra no grafo
+        # Extrai entidades do frontmatter E do conteúdo
         self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
+        self._extract_entities_from_content(draft_id, content)
         return PromotionResult(
             success=True,
@@ -307,8 +308,9 @@ class Promoter:
             content=body
         )
-        # Extrai entidades do frontmatter e registra no grafo
+        # Extrai entidades do frontmatter E do conteúdo
         self._extract_entities_from_frontmatter(draft_id, frontmatter, project)
+        self._extract_entities_from_content(draft_id, body)
         return PromotionResult(
             success=True,
@@ -477,3 +479,27 @@ class Promoter:
             frontmatter=frontmatter,
             project=project
         )
+    def _extract_entities_from_content(
+        self,
+        memory_id: str,
+        content: str
+    ) -> List[str]:
+        """
+        Extrai entidades do conteúdo usando spaCy NER e registra no grafo.
+        Args:
+            memory_id: ID da memória
+            content: Conteúdo de texto
+        Returns:
+            Lista de IDs de entidades criadas
+        """
+        if not self.entities_db:
+            return []
+        return self.entities_db.extract_from_content(
+            memory_id=memory_id,
+            content=content,
+            use_spacy=True
+        )

package/src/index/entities_db.py CHANGED Viewed

@@ -75,6 +75,21 @@ class EntitiesDB:
             ON entities(memory_id)
         """)
+        # Tabela de cache de hash (para evitar reprocessamento)
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS entity_cache (
+                memory_id TEXT PRIMARY KEY,
+                content_hash TEXT,
+                processed_at TEXT DEFAULT (datetime('now')),
+                entity_count INTEGER DEFAULT 0
+            )
+        """)
+        conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_entity_cache_hash
+            ON entity_cache(content_hash)
+        """)
         # Tabela de relacionamentos
         conn.execute("""
             CREATE TABLE IF NOT EXISTS entity_relationships (
@@ -544,6 +559,232 @@ class EntitiesDB:
         return entity_ids
+    def extract_from_content(
+        self,
+        memory_id: str,
+        content: str,
+        use_spacy: bool = True
+    ) -> List[str]:
+        """
+        Extrai entidades do conteúdo usando spaCy NER.
+        Args:
+            memory_id: ID da memória
+            content: Conteúdo de texto
+            use_spacy: Usar spaCy (padrão: True)
+        Returns:
+            Lista de IDs de entidades criadas
+        """
+        # Verifica cache - só processa se conteúdo mudou
+        cached_hash = self.get_cached_hash(memory_id)
+        current_hash = self._compute_hash(content)
+        if cached_hash == current_hash:
+            # Conteúdo igual, verifica se já tem entidades
+            existing = self.get_entities_by_memory(memory_id)
+            if existing:
+                return []  # Já processado, sem mudanças
+        # Conteúdo novo ou mudou - remove entidades antigas e reprocessa
+        self.delete_entities_by_memory(memory_id)
+        entity_ids = []
+        if use_spacy:
+            try:
+                import spacy
+                from spacy import Language
+                # Carrega modelo (download na primeira vez)
+                try:
+                    nlp: Language = spacy.load("pt_core_news_sm")
+                except OSError:
+                    # Modelo não instalado - tenta instalar
+                    import subprocess
+                    subprocess.run(
+                        ["python", "-m", "spacy", "download", "pt_core_news_sm"],
+                        capture_output=True
+                    )
+                    nlp = spacy.load("pt_core_news_sm")
+                # Processa texto
+                doc = nlp(content[:5000])  # Limita a 5000 chars para performance
+                # Mapeia labels do spaCy para nossos tipos
+                label_map = {
+                    "ORG": "ORG",
+                    "PERSON": "PERSON",
+                    "GPE": "LOC",
+                    "LOC": "LOC",
+                    "PRODUCT": "PRODUCT",
+                    "EVENT": "EVENT",
+                    "WORK_OF_ART": "PRODUCT",
+                    "LAW": "PRODUCT",
+                    "LANGUAGE": "TECH",
+                }
+                for ent in doc.ents:
+                    if ent.label_ in label_map:
+                        entity_type = label_map[ent.label_]
+                        confidence = float(ent._.get("nerd_score", 0.8)) if hasattr(ent._, "nerd_score") else 0.8
+                        eid = self.insert_entity(
+                            memory_id,
+                            ent.text,
+                            entity_type,
+                            confidence=confidence,
+                            span_start=ent.start_char,
+                            span_end=ent.end_char,
+                            context_snippet=content[max(0, ent.start_char - 25):ent.end_char + 25][:50]
+                        )
+                        entity_ids.append(eid)
+                # Atualiza cache após processamento
+                self.update_cache(memory_id, content, len(entity_ids))
+            except ImportError:
+                # spaCy não disponível - usa fallback simples
+                entity_ids.extend(self._extract_entities_fallback(memory_id, content))
+                # Atualiza cache após processamento
+                self.update_cache(memory_id, content, len(entity_ids))
+            except Exception:
+                # Falha silenciosa - não quebra o fluxo
+                pass
+        else:
+            entity_ids.extend(self._extract_entities_fallback(memory_id, content))
+            # Atualiza cache após processamento
+            self.update_cache(memory_id, content, len(entity_ids))
+        return entity_ids
+    def _extract_entities_fallback(
+        self,
+        memory_id: str,
+        content: str
+    ) -> List[str]:
+        """
+        Fallback sem spaCy - usa heurísticas simples.
+        Args:
+            memory_id: ID da memória
+            content: Conteúdo de texto
+        Returns:
+            Lista de IDs de entidades criadas
+        """
+        entity_ids = []
+        # Palavras capitalizadas (prováveis nomes próprios/ORGs)
+        capitalized = re.findall(r'\b[A-Z][a-zA-Z]{2,}\b', content[:2000])
+        # Filtra termos comuns que não são entidades
+        common_words = {
+            "The", "This", "That", "These", "Those", "What", "When", "Where", "Why", "How",
+            "Para", "Com", "Por", "Em", "De", "Do", "Da", "Dos", "Das", "Uma", "Um",
+            "Como", "Quando", "Onde", "Qual", "Quais", "Quem", "Sobre"
+        }
+        entities = set(e for e in capitalized if e not in common_words)
+        for entity in list(entities)[:20]:  # Limita a 20 entidades
+            eid = self.insert_entity(
+                memory_id,
+                entity,
+                "ORG" if entity[0].isupper() else "PERSON",
+                confidence=0.5,  # Baixa confiança sem spaCy
+                context_snippet=entity
+            )
+            entity_ids.append(eid)
+        return entity_ids
+    # ========================================================================
+    # CACHE DE HASH (para evitar reprocessamento)
+    # ========================================================================
+    def _compute_hash(self, content: str) -> str:
+        """
+        Computa hash do conteúdo.
+        Args:
+            content: Conteúdo de texto
+        Returns:
+            Hash SHA256 (primeiros 16 chars)
+        """
+        import hashlib
+        return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]
+    def get_cached_hash(self, memory_id: str) -> Optional[str]:
+        """
+        Obtém hash em cache de uma memória.
+        Args:
+            memory_id: ID da memória
+        Returns:
+            Hash armazenado ou None
+        """
+        conn = self._connect()
+        cursor = conn.execute(
+            "SELECT content_hash FROM entity_cache WHERE memory_id = ?",
+            (memory_id,)
+        )
+        row = cursor.fetchone()
+        conn.close()
+        return row["content_hash"] if row else None
+    def is_content_changed(self, memory_id: str, content: str) -> bool:
+        """
+        Verifica se conteúdo mudou desde último processamento.
+        Args:
+            memory_id: ID da memória
+            content: Conteúdo de texto
+        Returns:
+            True se mudou, False se igual
+        """
+        current_hash = self._compute_hash(content)
+        cached_hash = self.get_cached_hash(memory_id)
+        return cached_hash != current_hash
+    def update_cache(self, memory_id: str, content: str, entity_count: int) -> None:
+        """
+        Atualiza cache de hash após processamento.
+        Args:
+            memory_id: ID da memória
+            content: Conteúdo de texto
+            entity_count: Número de entidades extraídas
+        """
+        content_hash = self._compute_hash(content)
+        conn = self._connect()
+        conn.execute("""
+            INSERT OR REPLACE INTO entity_cache
+            (memory_id, content_hash, processed_at, entity_count)
+            VALUES (?, ?, datetime('now'), ?)
+        """, (memory_id, content_hash, entity_count))
+        conn.commit()
+        conn.close()
+    def clear_cache(self, memory_id: str) -> None:
+        """
+        Limpa cache de uma memória.
+        Args:
+            memory_id: ID da memória
+        """
+        conn = self._connect()
+        conn.execute(
+            "DELETE FROM entity_cache WHERE memory_id = ?",
+            (memory_id,)
+        )
+        conn.commit()
+        conn.close()
     # ========================================================================
     # ESTATÍSTICAS
     # ========================================================================

package/src/index/queries.py CHANGED Viewed

@@ -87,6 +87,7 @@ class QueryEngine:
         if use_fts:
             fts_results = self._search_fts(query, project, mem_type, limit)
             for r in fts_results:
+                r.score *= fts_weight  # Aplica peso FTS desde o início
                 results[r.memory_id] = r
         # Busca Semantic
@@ -94,12 +95,9 @@ class QueryEngine:
             semantic_results = self._search_semantic(query, project, limit)
             for r in semantic_results:
                 if r.memory_id in results:
-                    # Combina scores
+                    # Combina scores: média ponderada (FTS já tem peso aplicado)
                     existing = results[r.memory_id]
-                    combined_score = (
-                        existing.score * fts_weight +
-                        r.score * semantic_weight
-                    )
+                    combined_score = existing.score + (r.score * semantic_weight)
                     results[r.memory_id] = QueryResult(
                         memory_id=r.memory_id,
                         type=r.type,
@@ -118,12 +116,9 @@ class QueryEngine:
             graph_results = self._search_by_graph(query, limit)
             for r in graph_results:
                 if r.memory_id in results:
-                    # Combina scores
+                    # Combina scores: soma ponderada (scores anteriores já têm peso)
                     existing = results[r.memory_id]
-                    combined_score = (
-                        existing.score * (1 - graph_weight) +
-                        r.score * graph_weight
-                    )
+                    combined_score = existing.score + (r.score * graph_weight)
                     results[r.memory_id] = QueryResult(
                         memory_id=r.memory_id,
                         type=r.type,

package/src/mcp/server.py CHANGED Viewed

@@ -532,7 +532,8 @@ class CerebroMCP:
             "",
             "Índice:",
             f"  Metadata DB: {self.cerebro_path / 'index' / 'metadata.db'}",
-            f"  Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}"
+            f"  Embeddings DB: {self.cerebro_path / 'index' / 'embeddings.db'}",
+            f"  Entities DB: {self.cerebro_path / 'index' / 'entities.db'}"
         ]
         return "\n".join(lines)
@@ -712,8 +713,14 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
         desc_match = re.search(r'description:\s*(.*)', content)
         type_match = re.search(r'type:\s*(.*)', content)
+        project_match = re.search(r'project:\s*(.*)', content)
+        tags_match = re.search(r'tags:\s*(.*)', content)
         desc = desc_match.group(1).strip() if desc_match else "sem descrição"
         m_type = type_match.group(1).strip() if type_match else "project"
+        project = project_match.group(1).strip() if project_match else "unknown"
+        tags = tags_match.group(1).strip() if tags_match else ""
         ts = datetime.now().strftime("%Y-%m-%d")
         entry = f"- [{m_type}] {mem_name}.md ({ts}): {desc}\n"
@@ -726,6 +733,29 @@ Uma chamada por memória. O sistema salva e indexa automaticamente.
         else:
             index_path.write_text(f"# Memórias do Projeto\n\n{entry}", encoding="utf-8")
+        # BUG FIX: Registrar entidades no grafo (frontmatter + conteúdo)
+        if self.entities_db:
+            import yaml
+            frontmatter_match = re.match(r'^---\n(.*?)\n---\n(.*)$', content, re.DOTALL)
+            if frontmatter_match:
+                try:
+                    frontmatter = yaml.safe_load(frontmatter_match.group(1))
+                    # Extrai entidades do frontmatter
+                    self.entities_db.extract_from_frontmatter(
+                        memory_id=mem_name,
+                        frontmatter=frontmatter or {},
+                        project=project
+                    )
+                    # Extrai entidades do conteúdo (spaCy NER)
+                    body_content = frontmatter_match.group(2)
+                    self.entities_db.extract_from_content(
+                        memory_id=mem_name,
+                        content=body_content,
+                        use_spacy=True
+                    )
+                except Exception as e:
+                    pass  # Falha silenciosa se frontmatter inválido
         return f"✅ Memória '{mem_name}' salva em {file_path}"
     def _remember(self, args: Dict[str, Any]) -> str: