PyPI - cognee - Versions diffs - 0.2.2.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

cognee 0.2.2.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (214) hide show

cognee/api/client.py CHANGED Viewed

@@ -16,6 +16,7 @@ from fastapi.openapi.utils import get_openapi
 from cognee.exceptions import CogneeApiError
 from cognee.shared.logging_utils import get_logger, setup_logging
+from cognee.api.health import health_checker, HealthStatus
 from cognee.api.v1.permissions.routers import get_permissions_router
 from cognee.api.v1.settings.routers import get_settings_router
 from cognee.api.v1.datasets.routers import get_datasets_router
@@ -161,11 +162,48 @@ async def root():
 @app.get("/health")
-def health_check():
+async def health_check():
     """
-    Health check endpoint that returns the server status.
+    Health check endpoint for liveness/readiness probes.
     """
-    return Response(status_code=200)
+    try:
+        health_status = await health_checker.get_health_status(detailed=False)
+        status_code = 503 if health_status.status == HealthStatus.UNHEALTHY else 200
+        return JSONResponse(
+            status_code=status_code,
+            content={
+                "status": "ready" if status_code == 200 else "not ready",
+                "health": health_status.status,
+                "version": health_status.version,
+            },
+        )
+    except Exception as e:
+        return JSONResponse(
+            status_code=503,
+            content={"status": "not ready", "reason": f"health check failed: {str(e)}"},
+        )
+@app.get("/health/detailed")
+async def detailed_health_check():
+    """
+    Comprehensive health status with component details.
+    """
+    try:
+        health_status = await health_checker.get_health_status(detailed=True)
+        status_code = 200
+        if health_status.status == HealthStatus.UNHEALTHY:
+            status_code = 503
+        elif health_status.status == HealthStatus.DEGRADED:
+            status_code = 200  # Degraded is still operational
+        return JSONResponse(status_code=status_code, content=health_status.model_dump())
+    except Exception as e:
+        return JSONResponse(
+            status_code=503,
+            content={"status": "unhealthy", "error": f"Health check system failure: {str(e)}"},
+        )
 app.include_router(get_auth_router(), prefix="/api/v1/auth", tags=["auth"])

cognee/api/health.py ADDED Viewed

@@ -0,0 +1,332 @@
+"""Health check system for cognee API."""
+import time
+import asyncio
+from datetime import datetime, timezone
+from typing import Dict, Any, Optional
+from enum import Enum
+from pydantic import BaseModel
+from cognee.version import get_cognee_version
+from cognee.shared.logging_utils import get_logger
+logger = get_logger()
+class HealthStatus(str, Enum):
+    HEALTHY = "healthy"
+    DEGRADED = "degraded"
+    UNHEALTHY = "unhealthy"
+class ComponentHealth(BaseModel):
+    status: HealthStatus
+    provider: str
+    response_time_ms: int
+    details: str
+class HealthResponse(BaseModel):
+    status: HealthStatus
+    timestamp: str
+    version: str
+    uptime: int
+    components: Dict[str, ComponentHealth]
+class HealthChecker:
+    def __init__(self):
+        self.start_time = time.time()
+    async def check_relational_db(self) -> ComponentHealth:
+        """Check relational database health."""
+        start_time = time.time()
+        try:
+            from cognee.infrastructure.databases.relational.get_relational_engine import (
+                get_relational_engine,
+            )
+            from cognee.infrastructure.databases.relational.config import get_relational_config
+            config = get_relational_config()
+            engine = get_relational_engine()
+            # Test connection by creating a session
+            session = engine.get_session()
+            if session:
+                await session.close()
+            response_time = int((time.time() - start_time) * 1000)
+            return ComponentHealth(
+                status=HealthStatus.HEALTHY,
+                provider=config.db_provider,
+                response_time_ms=response_time,
+                details="Connection successful",
+            )
+        except Exception as e:
+            response_time = int((time.time() - start_time) * 1000)
+            logger.error(f"Relational DB health check failed: {str(e)}", exc_info=True)
+            return ComponentHealth(
+                status=HealthStatus.UNHEALTHY,
+                provider="unknown",
+                response_time_ms=response_time,
+                details=f"Connection failed: {str(e)}",
+            )
+    async def check_vector_db(self) -> ComponentHealth:
+        """Check vector database health."""
+        start_time = time.time()
+        try:
+            from cognee.infrastructure.databases.vector.get_vector_engine import get_vector_engine
+            from cognee.infrastructure.databases.vector.config import get_vectordb_config
+            config = get_vectordb_config()
+            engine = get_vector_engine()
+            # Test basic operation - just check if engine is accessible
+            if hasattr(engine, "health_check"):
+                await engine.health_check()
+            elif hasattr(engine, "list_tables"):
+                # For LanceDB and similar
+                engine.list_tables()
+            response_time = int((time.time() - start_time) * 1000)
+            return ComponentHealth(
+                status=HealthStatus.HEALTHY,
+                provider=config.vector_db_provider,
+                response_time_ms=response_time,
+                details="Index accessible",
+            )
+        except Exception as e:
+            response_time = int((time.time() - start_time) * 1000)
+            logger.error(f"Vector DB health check failed: {str(e)}", exc_info=True)
+            return ComponentHealth(
+                status=HealthStatus.UNHEALTHY,
+                provider="unknown",
+                response_time_ms=response_time,
+                details=f"Connection failed: {str(e)}",
+            )
+    async def check_graph_db(self) -> ComponentHealth:
+        """Check graph database health."""
+        start_time = time.time()
+        try:
+            from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
+            from cognee.infrastructure.databases.graph.config import get_graph_config
+            config = get_graph_config()
+            engine = await get_graph_engine()
+            # Test basic operation with actual graph query
+            if hasattr(engine, "execute"):
+                # For SQL-like graph DBs (Neo4j, Memgraph)
+                await engine.execute("MATCH () RETURN count(*) LIMIT 1")
+            elif hasattr(engine, "query"):
+                # For other graph engines
+                engine.query("MATCH () RETURN count(*) LIMIT 1", {})
+            # If engine exists but no test method, consider it healthy
+            response_time = int((time.time() - start_time) * 1000)
+            return ComponentHealth(
+                status=HealthStatus.HEALTHY,
+                provider=config.graph_database_provider,
+                response_time_ms=response_time,
+                details="Schema validated",
+            )
+        except Exception as e:
+            response_time = int((time.time() - start_time) * 1000)
+            logger.error(f"Graph DB health check failed: {str(e)}", exc_info=True)
+            return ComponentHealth(
+                status=HealthStatus.UNHEALTHY,
+                provider="unknown",
+                response_time_ms=response_time,
+                details=f"Connection failed: {str(e)}",
+            )
+    async def check_file_storage(self) -> ComponentHealth:
+        """Check file storage health."""
+        start_time = time.time()
+        try:
+            import os
+            from cognee.infrastructure.files.storage.get_file_storage import get_file_storage
+            from cognee.base_config import get_base_config
+            base_config = get_base_config()
+            storage = get_file_storage(base_config.data_root_directory)
+            # Determine provider
+            provider = "s3" if base_config.data_root_directory.startswith("s3://") else "local"
+            # Test storage accessibility - for local storage, just check directory exists
+            if provider == "local":
+                os.makedirs(base_config.data_root_directory, exist_ok=True)
+                # Simple write/read test
+                test_file = os.path.join(base_config.data_root_directory, "health_check_test")
+                with open(test_file, "w") as f:
+                    f.write("test")
+                os.remove(test_file)
+            else:
+                # For S3, test basic operations
+                test_path = "health_check_test"
+                await storage.store(test_path, b"test")
+                await storage.delete(test_path)
+            response_time = int((time.time() - start_time) * 1000)
+            return ComponentHealth(
+                status=HealthStatus.HEALTHY,
+                provider=provider,
+                response_time_ms=response_time,
+                details="Storage accessible",
+            )
+        except Exception as e:
+            response_time = int((time.time() - start_time) * 1000)
+            return ComponentHealth(
+                status=HealthStatus.UNHEALTHY,
+                provider="unknown",
+                response_time_ms=response_time,
+                details=f"Storage test failed: {str(e)}",
+            )
+    async def check_llm_provider(self) -> ComponentHealth:
+        """Check LLM provider health (non-critical)."""
+        start_time = time.time()
+        try:
+            from cognee.infrastructure.llm.get_llm_client import get_llm_client
+            from cognee.infrastructure.llm.config import get_llm_config
+            config = get_llm_config()
+            # Test actual API connection with minimal request
+            client = get_llm_client()
+            await client.show_prompt("test", "test")
+            response_time = int((time.time() - start_time) * 1000)
+            return ComponentHealth(
+                status=HealthStatus.HEALTHY,
+                provider=config.llm_provider,
+                response_time_ms=response_time,
+                details="API responding",
+            )
+        except Exception as e:
+            response_time = int((time.time() - start_time) * 1000)
+            logger.error(f"LLM provider health check failed: {str(e)}", exc_info=True)
+            return ComponentHealth(
+                status=HealthStatus.DEGRADED,
+                provider="unknown",
+                response_time_ms=response_time,
+                details=f"API check failed: {str(e)}",
+            )
+    async def check_embedding_service(self) -> ComponentHealth:
+        """Check embedding service health (non-critical)."""
+        start_time = time.time()
+        try:
+            from cognee.infrastructure.databases.vector.embeddings.get_embedding_engine import (
+                get_embedding_engine,
+            )
+            # Test actual embedding generation with minimal text
+            engine = get_embedding_engine()
+            await engine.embed_text("test")
+            response_time = int((time.time() - start_time) * 1000)
+            return ComponentHealth(
+                status=HealthStatus.HEALTHY,
+                provider="configured",
+                response_time_ms=response_time,
+                details="Embedding generation working",
+            )
+        except Exception as e:
+            response_time = int((time.time() - start_time) * 1000)
+            return ComponentHealth(
+                status=HealthStatus.DEGRADED,
+                provider="unknown",
+                response_time_ms=response_time,
+                details=f"Embedding test failed: {str(e)}",
+            )
+    async def get_health_status(self, detailed: bool = False) -> HealthResponse:
+        """Get comprehensive health status."""
+        components = {}
+        # Critical services
+        critical_components = [
+            "relational_db",
+            "vector_db",
+            "graph_db",
+            "file_storage",
+            "llm_provider",
+            "embedding_service",
+        ]
+        critical_checks = [
+            ("relational_db", self.check_relational_db()),
+            ("vector_db", self.check_vector_db()),
+            ("graph_db", self.check_graph_db()),
+            ("file_storage", self.check_file_storage()),
+            ("llm_provider", self.check_llm_provider()),
+            ("embedding_service", self.check_embedding_service()),
+        ]
+        # Non-critical services (only for detailed checks)
+        non_critical_checks = []
+        # Run critical checks
+        critical_results = await asyncio.gather(
+            *[check for _, check in critical_checks], return_exceptions=True
+        )
+        for (name, _), result in zip(critical_checks, critical_results):
+            if isinstance(result, Exception):
+                components[name] = ComponentHealth(
+                    status=HealthStatus.UNHEALTHY,
+                    provider="unknown",
+                    response_time_ms=0,
+                    details=f"Health check failed: {str(result)}",
+                )
+            else:
+                components[name] = result
+        # Run non-critical checks if detailed (currently none)
+        if detailed and non_critical_checks:
+            non_critical_results = await asyncio.gather(
+                *[check for _, check in non_critical_checks], return_exceptions=True
+            )
+            for (name, _), result in zip(non_critical_checks, non_critical_results):
+                if isinstance(result, Exception):
+                    components[name] = ComponentHealth(
+                        status=HealthStatus.DEGRADED,
+                        provider="unknown",
+                        response_time_ms=0,
+                        details=f"Health check failed: {str(result)}",
+                    )
+                else:
+                    components[name] = result
+        # Determine overall status
+        critical_unhealthy = any(
+            comp.status == HealthStatus.UNHEALTHY
+            for name, comp in components.items()
+            if name in critical_components
+        )
+        has_degraded = any(comp.status == HealthStatus.DEGRADED for comp in components.values())
+        if critical_unhealthy:
+            overall_status = HealthStatus.UNHEALTHY
+        elif has_degraded:
+            overall_status = HealthStatus.DEGRADED
+        else:
+            overall_status = HealthStatus.HEALTHY
+        return HealthResponse(
+            status=overall_status,
+            timestamp=datetime.now(timezone.utc).isoformat(),
+            version=get_cognee_version(),
+            uptime=int(time.time() - self.start_time),
+            components=components,
+        )
+# Global health checker instance
+health_checker = HealthChecker()

cognee/api/v1/add/add.py CHANGED Viewed

@@ -15,6 +15,8 @@ async def add(
     vector_db_config: dict = None,
     graph_db_config: dict = None,
     dataset_id: Optional[UUID] = None,
+    preferred_loaders: List[str] = None,
+    incremental_loading: bool = True,
 ):
     """
     Add data to Cognee for knowledge graph processing.
@@ -129,7 +131,7 @@ async def add(
         - LLM_MODEL: Model name (default: "gpt-4o-mini")
         - DEFAULT_USER_EMAIL: Custom default user email
         - DEFAULT_USER_PASSWORD: Custom default user password
-        - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "qdrant", "weaviate"
+        - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "pgvector"
         - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx"
     Raises:
@@ -140,7 +142,7 @@ async def add(
     """
     tasks = [
         Task(resolve_data_directories, include_subdirectories=True),
-        Task(ingest_data, dataset_name, user, node_set, dataset_id),
+        Task(ingest_data, dataset_name, user, node_set, dataset_id, preferred_loaders),
     ]
     pipeline_run_info = None
@@ -153,6 +155,7 @@ async def add(
         pipeline_name="add_pipeline",
         vector_db_config=vector_db_config,
         graph_db_config=graph_db_config,
+        incremental_loading=incremental_loading,
     ):
         pipeline_run_info = run_info

cognee/api/v1/add/routers/get_add_router.py CHANGED Viewed

@@ -11,6 +11,7 @@ from typing import List, Optional, Union, Literal
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_authenticated_user
 from cognee.shared.utils import send_telemetry
+from cognee.modules.pipelines.models import PipelineRunErrored
 from cognee.shared.logging_utils import get_logger
 logger = get_logger()
@@ -100,6 +101,8 @@ def get_add_router() -> APIRouter:
             else:
                 add_run = await cognee_add(data, datasetName, user=user, dataset_id=datasetId)
+                if isinstance(add_run, PipelineRunErrored):
+                    return JSONResponse(status_code=420, content=add_run.model_dump(mode="json"))
                 return add_run.model_dump()
         except Exception as error:
             return JSONResponse(status_code=409, content={"error": str(error)})

cognee/api/v1/cognify/code_graph_pipeline.py CHANGED Viewed

@@ -79,7 +79,9 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
         async for run_status in non_code_pipeline_run:
             yield run_status
-    async for run_status in run_tasks(tasks, dataset.id, repo_path, user, "cognify_code_pipeline"):
+    async for run_status in run_tasks(
+        tasks, dataset.id, repo_path, user, "cognify_code_pipeline", incremental_loading=False
+    ):
         yield run_status

cognee/api/v1/cognify/cognify.py CHANGED Viewed

@@ -39,6 +39,7 @@ async def cognify(
     vector_db_config: dict = None,
     graph_db_config: dict = None,
     run_in_background: bool = False,
+    incremental_loading: bool = True,
 ):
     """
     Transform ingested data into a structured knowledge graph.
@@ -194,6 +195,7 @@ async def cognify(
             datasets=datasets,
             vector_db_config=vector_db_config,
             graph_db_config=graph_db_config,
+            incremental_loading=incremental_loading,
         )
     else:
         return await run_cognify_blocking(
@@ -202,6 +204,7 @@ async def cognify(
             datasets=datasets,
             vector_db_config=vector_db_config,
             graph_db_config=graph_db_config,
+            incremental_loading=incremental_loading,
         )
@@ -211,6 +214,7 @@ async def run_cognify_blocking(
     datasets,
     graph_db_config: dict = None,
     vector_db_config: dict = False,
+    incremental_loading: bool = True,
 ):
     total_run_info = {}
@@ -221,6 +225,7 @@ async def run_cognify_blocking(
         pipeline_name="cognify_pipeline",
         graph_db_config=graph_db_config,
         vector_db_config=vector_db_config,
+        incremental_loading=incremental_loading,
     ):
         if run_info.dataset_id:
             total_run_info[run_info.dataset_id] = run_info
@@ -236,6 +241,7 @@ async def run_cognify_as_background_process(
     datasets,
     graph_db_config: dict = None,
     vector_db_config: dict = False,
+    incremental_loading: bool = True,
 ):
     # Convert dataset to list if it's a string
     if isinstance(datasets, str):
@@ -246,6 +252,7 @@ async def run_cognify_as_background_process(
     async def handle_rest_of_the_run(pipeline_list):
         # Execute all provided pipelines one by one to avoid database write conflicts
+        # TODO: Convert to async gather task instead of for loop when Queue mechanism for database is created
         for pipeline in pipeline_list:
             while True:
                 try:
@@ -270,6 +277,7 @@ async def run_cognify_as_background_process(
             pipeline_name="cognify_pipeline",
             graph_db_config=graph_db_config,
             vector_db_config=vector_db_config,
+            incremental_loading=incremental_loading,
         )
         # Save dataset Pipeline run started info

cognee/api/v1/cognify/routers/get_cognify_router.py CHANGED Viewed

@@ -16,7 +16,11 @@ from cognee.modules.graph.methods import get_formatted_graph_data
 from cognee.modules.users.get_user_manager import get_user_manager_context
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.users.authentication.default.default_jwt_strategy import DefaultJWTStrategy
-from cognee.modules.pipelines.models.PipelineRunInfo import PipelineRunCompleted, PipelineRunInfo
+from cognee.modules.pipelines.models.PipelineRunInfo import (
+    PipelineRunCompleted,
+    PipelineRunInfo,
+    PipelineRunErrored,
+)
 from cognee.modules.pipelines.queues.pipeline_run_info_queues import (
     get_from_queue,
     initialize_queue,
@@ -105,6 +109,9 @@ def get_cognify_router() -> APIRouter:
                 datasets, user, run_in_background=payload.run_in_background
             )
+            # If any cognify run errored return JSONResponse with proper error status code
+            if any(isinstance(v, PipelineRunErrored) for v in cognify_run.values()):
+                return JSONResponse(status_code=420, content=cognify_run)
             return cognify_run
         except Exception as error:
             return JSONResponse(status_code=409, content={"error": str(error)})

cognee/api/v1/config/config.py CHANGED Viewed

@@ -7,7 +7,9 @@ from cognee.modules.cognify.config import get_cognify_config
 from cognee.infrastructure.data.chunking.config import get_chunk_config
 from cognee.infrastructure.databases.vector import get_vectordb_config
 from cognee.infrastructure.databases.graph.config import get_graph_config
-from cognee.infrastructure.llm.config import get_llm_config
+from cognee.infrastructure.llm.config import (
+    get_llm_config,
+)
 from cognee.infrastructure.databases.relational import get_relational_config, get_migration_config

cognee/api/v1/datasets/routers/get_datasets_router.py CHANGED Viewed

@@ -283,14 +283,8 @@ def get_datasets_router() -> APIRouter:
         - **404 Not Found**: Dataset doesn't exist or user doesn't have access
         - **500 Internal Server Error**: Error retrieving graph data
         """
-        from cognee.modules.data.methods import get_dataset
-        dataset = await get_dataset(user.id, dataset_id)
-        if dataset is None:
-            raise DatasetNotFoundError(message=f"Dataset ({str(dataset_id)}) not found.")
-        graph_data = await get_formatted_graph_data(dataset.id, user.id)
+        graph_data = await get_formatted_graph_data(dataset_id, user.id)
         return graph_data

cognee/api/v1/delete/delete.py CHANGED Viewed

@@ -16,7 +16,11 @@ from cognee.modules.users.methods import get_default_user
 from cognee.modules.data.methods import get_authorized_existing_datasets
 from cognee.context_global_variables import set_database_global_context_variables
-from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
+from cognee.api.v1.delete.exceptions import (
+    DocumentNotFoundError,
+    DatasetNotFoundError,
+    DocumentSubgraphNotFoundError,
+)
 logger = get_logger()
@@ -82,17 +86,17 @@ async def delete(
             raise DocumentNotFoundError(f"Data {data_id} not found in dataset {dataset_id}")
         # Get the content hash for deletion
-        content_hash = data_point.content_hash
+        data_id = str(data_point.id)
     # Use the existing comprehensive deletion logic
-    return await delete_single_document(content_hash, dataset.id, mode)
+    return await delete_single_document(data_id, dataset.id, mode)
-async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
+async def delete_single_document(data_id: str, dataset_id: UUID = None, mode: str = "soft"):
     """Delete a single document by its content hash."""
     # Delete from graph database
-    deletion_result = await delete_document_subgraph(content_hash, mode)
+    deletion_result = await delete_document_subgraph(data_id, mode)
     logger.info(f"Deletion result: {deletion_result}")
@@ -163,12 +167,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
         # Get the data point
         data_point = (
-            await session.execute(select(Data).filter(Data.content_hash == content_hash))
+            await session.execute(select(Data).filter(Data.id == UUID(data_id)))
         ).scalar_one_or_none()
         if data_point is None:
             raise DocumentNotFoundError(
-                f"Document not found in relational DB with content hash: {content_hash}"
+                f"Document not found in relational DB with data id: {data_id}"
             )
         doc_id = data_point.id
@@ -203,7 +207,7 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
         "status": "success",
         "message": "Document deleted from both graph and relational databases",
         "graph_deletions": deletion_result["deleted_counts"],
-        "content_hash": content_hash,
+        "data_id": data_id,
         "dataset": dataset_id,
         "deleted_node_ids": [
             str(node_id) for node_id in deleted_node_ids
@@ -211,12 +215,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
     }
-async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
+async def delete_document_subgraph(document_id: str, mode: str = "soft"):
     """Delete a document and all its related nodes in the correct order."""
     graph_db = await get_graph_engine()
-    subgraph = await graph_db.get_document_subgraph(content_hash)
+    subgraph = await graph_db.get_document_subgraph(document_id)
     if not subgraph:
-        raise DocumentSubgraphNotFoundError(f"Document not found with content hash: {content_hash}")
+        raise DocumentSubgraphNotFoundError(f"Document not found with id: {document_id}")
     # Delete in the correct order to maintain graph integrity
     deletion_order = [
@@ -260,6 +264,6 @@ async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
     return {
         "status": "success",
         "deleted_counts": deleted_counts,
-        "content_hash": content_hash,
+        "document_id": document_id,
         "deleted_node_ids": deleted_node_ids,
     }

cognee/api/v1/responses/routers/get_responses_router.py CHANGED Viewed

@@ -17,7 +17,9 @@ from cognee.api.v1.responses.models import (
 )
 from cognee.api.v1.responses.dispatch_function import dispatch_function
 from cognee.api.v1.responses.default_tools import DEFAULT_TOOLS
-from cognee.infrastructure.llm.config import get_llm_config
+from cognee.infrastructure.llm.config import (
+    get_llm_config,
+)
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_authenticated_user

cognee 0.2.2.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

cognee 0.2.2.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl