cognee 0.2.4__py3-none-any.whl → 0.3.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/client.py +28 -3
- cognee/api/health.py +10 -13
- cognee/api/v1/add/add.py +3 -1
- cognee/api/v1/add/routers/get_add_router.py +12 -37
- cognee/api/v1/cloud/routers/__init__.py +1 -0
- cognee/api/v1/cloud/routers/get_checks_router.py +23 -0
- cognee/api/v1/cognify/code_graph_pipeline.py +9 -4
- cognee/api/v1/cognify/cognify.py +50 -3
- cognee/api/v1/cognify/routers/get_cognify_router.py +1 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +15 -4
- cognee/api/v1/memify/__init__.py +0 -0
- cognee/api/v1/memify/routers/__init__.py +1 -0
- cognee/api/v1/memify/routers/get_memify_router.py +100 -0
- cognee/api/v1/notebooks/routers/__init__.py +1 -0
- cognee/api/v1/notebooks/routers/get_notebooks_router.py +96 -0
- cognee/api/v1/search/routers/get_search_router.py +20 -1
- cognee/api/v1/search/search.py +11 -4
- cognee/api/v1/sync/__init__.py +17 -0
- cognee/api/v1/sync/routers/__init__.py +3 -0
- cognee/api/v1/sync/routers/get_sync_router.py +241 -0
- cognee/api/v1/sync/sync.py +877 -0
- cognee/api/v1/users/routers/get_auth_router.py +13 -1
- cognee/base_config.py +10 -1
- cognee/infrastructure/databases/graph/config.py +10 -4
- cognee/infrastructure/databases/graph/kuzu/adapter.py +135 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +89 -0
- cognee/infrastructure/databases/relational/__init__.py +2 -0
- cognee/infrastructure/databases/relational/get_async_session.py +15 -0
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +6 -1
- cognee/infrastructure/databases/relational/with_async_session.py +25 -0
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +1 -1
- cognee/infrastructure/databases/vector/config.py +13 -6
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +2 -6
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +4 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -0
- cognee/infrastructure/files/storage/S3FileStorage.py +5 -0
- cognee/infrastructure/files/storage/StorageManager.py +7 -1
- cognee/infrastructure/files/storage/storage.py +16 -0
- cognee/infrastructure/llm/LLMGateway.py +18 -0
- cognee/infrastructure/llm/config.py +4 -2
- cognee/infrastructure/llm/prompts/extract_query_time.txt +15 -0
- cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +25 -0
- cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +30 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +2 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py +44 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py +1 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py +46 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +25 -1
- cognee/infrastructure/utils/run_sync.py +8 -1
- cognee/modules/chunking/models/DocumentChunk.py +4 -3
- cognee/modules/cloud/exceptions/CloudApiKeyMissingError.py +15 -0
- cognee/modules/cloud/exceptions/CloudConnectionError.py +15 -0
- cognee/modules/cloud/exceptions/__init__.py +2 -0
- cognee/modules/cloud/operations/__init__.py +1 -0
- cognee/modules/cloud/operations/check_api_key.py +25 -0
- cognee/modules/data/deletion/prune_system.py +1 -1
- cognee/modules/data/methods/check_dataset_name.py +1 -1
- cognee/modules/data/methods/get_dataset_data.py +1 -1
- cognee/modules/data/methods/load_or_create_datasets.py +1 -1
- cognee/modules/engine/models/Event.py +16 -0
- cognee/modules/engine/models/Interval.py +8 -0
- cognee/modules/engine/models/Timestamp.py +13 -0
- cognee/modules/engine/models/__init__.py +3 -0
- cognee/modules/engine/utils/__init__.py +2 -0
- cognee/modules/engine/utils/generate_event_datapoint.py +46 -0
- cognee/modules/engine/utils/generate_timestamp_datapoint.py +51 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +2 -2
- cognee/modules/graph/utils/__init__.py +1 -0
- cognee/modules/graph/utils/resolve_edges_to_text.py +71 -0
- cognee/modules/memify/__init__.py +1 -0
- cognee/modules/memify/memify.py +118 -0
- cognee/modules/notebooks/methods/__init__.py +5 -0
- cognee/modules/notebooks/methods/create_notebook.py +26 -0
- cognee/modules/notebooks/methods/delete_notebook.py +13 -0
- cognee/modules/notebooks/methods/get_notebook.py +21 -0
- cognee/modules/notebooks/methods/get_notebooks.py +18 -0
- cognee/modules/notebooks/methods/update_notebook.py +17 -0
- cognee/modules/notebooks/models/Notebook.py +53 -0
- cognee/modules/notebooks/models/__init__.py +1 -0
- cognee/modules/notebooks/operations/__init__.py +1 -0
- cognee/modules/notebooks/operations/run_in_local_sandbox.py +55 -0
- cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +19 -3
- cognee/modules/pipelines/operations/pipeline.py +1 -0
- cognee/modules/pipelines/operations/run_tasks.py +17 -41
- cognee/modules/retrieval/base_graph_retriever.py +18 -0
- cognee/modules/retrieval/base_retriever.py +1 -1
- cognee/modules/retrieval/code_retriever.py +8 -0
- cognee/modules/retrieval/coding_rules_retriever.py +31 -0
- cognee/modules/retrieval/completion_retriever.py +9 -3
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -0
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +23 -14
- cognee/modules/retrieval/graph_completion_cot_retriever.py +21 -11
- cognee/modules/retrieval/graph_completion_retriever.py +32 -65
- cognee/modules/retrieval/graph_summary_completion_retriever.py +3 -1
- cognee/modules/retrieval/insights_retriever.py +14 -3
- cognee/modules/retrieval/summaries_retriever.py +1 -1
- cognee/modules/retrieval/temporal_retriever.py +152 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +7 -32
- cognee/modules/retrieval/utils/completion.py +10 -3
- cognee/modules/search/methods/get_search_type_tools.py +168 -0
- cognee/modules/search/methods/no_access_control_search.py +47 -0
- cognee/modules/search/methods/search.py +219 -139
- cognee/modules/search/types/SearchResult.py +21 -0
- cognee/modules/search/types/SearchType.py +2 -0
- cognee/modules/search/types/__init__.py +1 -0
- cognee/modules/search/utils/__init__.py +2 -0
- cognee/modules/search/utils/prepare_search_result.py +41 -0
- cognee/modules/search/utils/transform_context_to_graph.py +38 -0
- cognee/modules/sync/__init__.py +1 -0
- cognee/modules/sync/methods/__init__.py +23 -0
- cognee/modules/sync/methods/create_sync_operation.py +53 -0
- cognee/modules/sync/methods/get_sync_operation.py +107 -0
- cognee/modules/sync/methods/update_sync_operation.py +248 -0
- cognee/modules/sync/models/SyncOperation.py +142 -0
- cognee/modules/sync/models/__init__.py +3 -0
- cognee/modules/users/__init__.py +0 -1
- cognee/modules/users/methods/__init__.py +4 -1
- cognee/modules/users/methods/create_user.py +26 -1
- cognee/modules/users/methods/get_authenticated_user.py +36 -42
- cognee/modules/users/methods/get_default_user.py +3 -1
- cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +2 -1
- cognee/root_dir.py +19 -0
- cognee/shared/logging_utils.py +1 -1
- cognee/tasks/codingagents/__init__.py +0 -0
- cognee/tasks/codingagents/coding_rule_associations.py +127 -0
- cognee/tasks/ingestion/save_data_item_to_storage.py +23 -0
- cognee/tasks/memify/__init__.py +2 -0
- cognee/tasks/memify/extract_subgraph.py +7 -0
- cognee/tasks/memify/extract_subgraph_chunks.py +11 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +52 -27
- cognee/tasks/temporal_graph/__init__.py +1 -0
- cognee/tasks/temporal_graph/add_entities_to_event.py +85 -0
- cognee/tasks/temporal_graph/enrich_events.py +34 -0
- cognee/tasks/temporal_graph/extract_events_and_entities.py +32 -0
- cognee/tasks/temporal_graph/extract_knowledge_graph_from_events.py +41 -0
- cognee/tasks/temporal_graph/models.py +49 -0
- cognee/tests/test_kuzu.py +4 -4
- cognee/tests/test_neo4j.py +4 -4
- cognee/tests/test_permissions.py +3 -3
- cognee/tests/test_relational_db_migration.py +7 -5
- cognee/tests/test_search_db.py +18 -24
- cognee/tests/test_temporal_graph.py +167 -0
- cognee/tests/unit/api/__init__.py +1 -0
- cognee/tests/unit/api/test_conditional_authentication_endpoints.py +246 -0
- cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +18 -2
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +13 -16
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +11 -16
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +5 -4
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +4 -2
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +18 -2
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +225 -0
- cognee/tests/unit/modules/users/__init__.py +1 -0
- cognee/tests/unit/modules/users/test_conditional_authentication.py +277 -0
- cognee/tests/unit/processing/utils/utils_test.py +20 -1
- {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/METADATA +8 -6
- {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/RECORD +162 -89
- cognee/tests/unit/modules/search/search_methods_test.py +0 -225
- {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/WHEEL +0 -0
- {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/entry_points.txt +0 -0
- {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/licenses/NOTICE.md +0 -0

cognee/modules/memify/memify.py
@@ -0,0 +1,118 @@
+from typing import Union, Optional, List, Type, Any
+from uuid import UUID
+
+from cognee.shared.logging_utils import get_logger
+
+from cognee.modules.retrieval.utils.brute_force_triplet_search import get_memory_fragment
+from cognee.context_global_variables import set_database_global_context_variables
+from cognee.modules.engine.models.node_set import NodeSet
+from cognee.modules.pipelines import run_pipeline
+from cognee.modules.pipelines.tasks.task import Task
+from cognee.modules.users.models import User
+from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
+    resolve_authorized_user_datasets,
+)
+from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
+    reset_dataset_pipeline_run_status,
+)
+from cognee.modules.engine.operations.setup import setup
+from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor
+from cognee.tasks.memify.extract_subgraph_chunks import extract_subgraph_chunks
+from cognee.tasks.codingagents.coding_rule_associations import (
+    add_rule_associations,
+)
+
+logger = get_logger("memify")
+
+
+async def memify(
+    extraction_tasks: Union[List[Task], List[str]] = None,
+    enrichment_tasks: Union[List[Task], List[str]] = None,
+    data: Optional[Any] = None,
+    dataset: Union[str, UUID] = "main_dataset",
+    user: User = None,
+    node_type: Optional[Type] = NodeSet,
+    node_name: Optional[List[str]] = None,
+    vector_db_config: Optional[dict] = None,
+    graph_db_config: Optional[dict] = None,
+    run_in_background: bool = False,
+):
+    """
+    Enrichment pipeline in Cognee, can work with already built graphs. If no data is provided existing knowledge graph will be used as data,
+    custom data can also be provided instead which can be processed with provided extraction and enrichment tasks.
+
+    Provided tasks and data will be arranged to run the Cognee pipeline and execute graph enrichment/creation.
+
+    This is the core processing step in Cognee that converts raw text and documents
+    into an intelligent knowledge graph. It analyzes content, extracts entities and
+    relationships, and creates semantic connections for enhanced search and reasoning.
+
+    Args:
+        extraction_tasks: List of Cognee Tasks to execute for graph/data extraction.
+        enrichment_tasks: List of Cognee Tasks to handle enrichment of provided graph/data from extraction tasks.
+        data: The data to ingest. Can be anything when custom extraction and enrichment tasks are used.
+            Data provided here will be forwarded to the first extraction task in the pipeline as input.
+            If no data is provided the whole graph (or subgraph if node_name/node_type is specified) will be forwarded
+        dataset: Dataset name or dataset uuid to process.
+        user: User context for authentication and data access. Uses default if None.
+        node_type: Filter graph to specific entity types (for advanced filtering). Used when no data is provided.
+        node_name: Filter graph to specific named entities (for targeted search). Used when no data is provided.
+        vector_db_config: Custom vector database configuration for embeddings storage.
+        graph_db_config: Custom graph database configuration for relationship storage.
+        run_in_background: If True, starts processing asynchronously and returns immediately.
+            If False, waits for completion before returning.
+            Background mode recommended for large datasets (>100MB).
+            Use pipeline_run_id from return value to monitor progress.
+    """
+
+    # Use default coding rules tasks if no tasks were provided
+    if not extraction_tasks:
+        extraction_tasks = [Task(extract_subgraph_chunks)]
+    if not enrichment_tasks:
+        enrichment_tasks = [
+            Task(
+                add_rule_associations,
+                rules_nodeset_name="coding_agent_rules",
+                task_config={"batch_size": 1},
+            )
+        ]
+
+    await setup()
+
+    user, authorized_dataset_list = await resolve_authorized_user_datasets(dataset, user)
+    authorized_dataset = authorized_dataset_list[0]
+
+    if not data:
+        # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
+        await set_database_global_context_variables(
+            authorized_dataset.id, authorized_dataset.owner_id
+        )
+
+        memory_fragment = await get_memory_fragment(node_type=node_type, node_name=node_name)
+        # Subgraphs should be a single element in the list to represent one data item
+        data = [memory_fragment]
+
+    memify_tasks = [
+        *extraction_tasks,  # Unpack tasks provided to memify pipeline
+        *enrichment_tasks,
+    ]
+
+    await reset_dataset_pipeline_run_status(
+        authorized_dataset.id, user, pipeline_names=["memify_pipeline"]
+    )
+
+    # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for
+    pipeline_executor_func = get_pipeline_executor(run_in_background=run_in_background)
+
+    # Run the run_pipeline in the background or blocking based on executor
+    return await pipeline_executor_func(
+        pipeline=run_pipeline,
+        tasks=memify_tasks,
+        user=user,
+        data=data,
+        datasets=authorized_dataset.id,
+        vector_db_config=vector_db_config,
+        graph_db_config=graph_db_config,
+        incremental_loading=False,
+        pipeline_name="memify_pipeline",
+    )
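
The new `memify` entry point above is the centerpiece of this release's module additions. A minimal usage sketch, assuming the `+1` line in `cognee/__init__.py` re-exports `memify` at package level (not confirmed by this diff):

import asyncio

import cognee  # assumes cognee/__init__.py (+1 above) re-exports memify


async def main():
    # No tasks and no data: falls back to Task(extract_subgraph_chunks) for
    # extraction, add_rule_associations for enrichment, and feeds the
    # existing graph for "main_dataset" through the memify_pipeline.
    await cognee.memify(dataset="main_dataset")

    # Background mode returns immediately; the docstring above says to use
    # the returned pipeline_run_id to monitor progress.
    pipeline_run = await cognee.memify(dataset="main_dataset", run_in_background=True)
    print(pipeline_run)


asyncio.run(main())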

cognee/modules/notebooks/methods/create_notebook.py
@@ -0,0 +1,26 @@
+from uuid import UUID
+from typing import List, Optional
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from cognee.infrastructure.databases.relational import with_async_session
+
+from ..models.Notebook import Notebook, NotebookCell
+
+
+@with_async_session
+async def create_notebook(
+    user_id: UUID,
+    notebook_name: str,
+    cells: Optional[List[NotebookCell]],
+    deletable: Optional[bool],
+    session: AsyncSession,
+) -> Notebook:
+    notebook = Notebook(
+        name=notebook_name, owner_id=user_id, cells=cells, deletable=deletable or True
+    )
+
+    session.add(notebook)
+
+    await session.commit()
+
+    return notebook
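
All five notebook CRUD helpers added here receive their `AsyncSession` through the new `with_async_session` decorator (`cognee/infrastructure/databases/relational/with_async_session.py`, +25 lines in the file list above); its body is not shown in these hunks. A plausible sketch of the usual shape of such a decorator, with a placeholder engine (an assumption, not the shipped implementation):

from functools import wraps

from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine

engine = create_async_engine("sqlite+aiosqlite:///:memory:")  # placeholder, not cognee's engine
async_session_maker = async_sessionmaker(engine, expire_on_commit=False)


def with_async_session(func):
    """Inject a fresh AsyncSession unless the caller supplies one."""

    @wraps(func)
    async def wrapper(*args, session: AsyncSession = None, **kwargs):
        if session is not None:
            return await func(*args, session=session, **kwargs)
        async with async_session_maker() as new_session:
            return await func(*args, session=new_session, **kwargs)

    return wrapper

Under that reading, callers can invoke `create_notebook(user_id, "My notebook", cells=None, deletable=None)` without passing a session. Note also that `deletable=deletable or True` in the body evaluates to `True` for every input (including `False`), so the parameter as written cannot mark a notebook non-deletable.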

cognee/modules/notebooks/methods/delete_notebook.py
@@ -0,0 +1,13 @@
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from cognee.infrastructure.databases.relational import with_async_session
+
+from ..models.Notebook import Notebook
+
+
+@with_async_session
+async def delete_notebook(
+    notebook: Notebook,
+    session: AsyncSession,
+) -> None:
+    await session.delete(notebook)

cognee/modules/notebooks/methods/get_notebook.py
@@ -0,0 +1,21 @@
+from uuid import UUID
+from typing import Optional
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from cognee.infrastructure.databases.relational import with_async_session
+
+from ..models.Notebook import Notebook
+
+
+@with_async_session
+async def get_notebook(
+    notebook_id: UUID,
+    user_id: UUID,
+    session: AsyncSession,
+) -> Optional[Notebook]:
+    result = await session.execute(
+        select(Notebook).where(Notebook.owner_id == user_id and Notebook.id == notebook_id)
+    )
+
+    return result.scalar()
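
One detail worth flagging in `get_notebook`: `Notebook.owner_id == user_id and Notebook.id == notebook_id` combines the two clauses with Python's `and`, which does not translate into a SQL `AND`. The conventional SQLAlchemy spellings (a corrective sketch reusing the names from the file above, not what ships in the wheel) are:

from sqlalchemy import and_, select

# where() with multiple arguments ANDs them together...
statement = select(Notebook).where(
    Notebook.owner_id == user_id,
    Notebook.id == notebook_id,
)

# ...or combine the clauses explicitly with and_().
statement = select(Notebook).where(
    and_(Notebook.owner_id == user_id, Notebook.id == notebook_id)
)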

cognee/modules/notebooks/methods/get_notebooks.py
@@ -0,0 +1,18 @@
+from uuid import UUID
+from typing import List
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from cognee.infrastructure.databases.relational import with_async_session
+
+from ..models.Notebook import Notebook
+
+
+@with_async_session
+async def get_notebooks(
+    user_id: UUID,
+    session: AsyncSession,
+) -> List[Notebook]:
+    result = await session.execute(select(Notebook).where(Notebook.owner_id == user_id))
+
+    return list(result.scalars().all())

cognee/modules/notebooks/methods/update_notebook.py
@@ -0,0 +1,17 @@
+from typing import Callable, AsyncContextManager
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from cognee.infrastructure.databases.relational import with_async_session
+
+from ..models.Notebook import Notebook
+
+
+@with_async_session
+async def update_notebook(
+    notebook: Notebook,
+    session: AsyncSession,
+) -> Notebook:
+    if notebook not in session:
+        session.add(notebook)
+
+    return notebook

cognee/modules/notebooks/models/Notebook.py
@@ -0,0 +1,53 @@
+import json
+from typing import List, Literal
+from uuid import uuid4, UUID as UUID_t
+from pydantic import BaseModel, ConfigDict
+from datetime import datetime, timezone
+from fastapi.encoders import jsonable_encoder
+from sqlalchemy import Boolean, Column, DateTime, JSON, UUID, String, TypeDecorator
+from sqlalchemy.orm import mapped_column, Mapped
+
+from cognee.infrastructure.databases.relational import Base
+
+
+class NotebookCell(BaseModel):
+    id: UUID_t
+    type: Literal["markdown", "code"]
+    name: str
+    content: str
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class NotebookCellList(TypeDecorator):
+    impl = JSON
+    cache_ok = True
+
+    def process_bind_param(self, notebook_cells, dialect):
+        if notebook_cells is None:
+            return []
+        return [
+            json.dumps(jsonable_encoder(cell)) if isinstance(cell, NotebookCell) else cell
+            for cell in notebook_cells
+        ]
+
+    def process_result_value(self, cells_json_list, dialect):
+        if cells_json_list is None:
+            return []
+        return [NotebookCell(**json.loads(json_string)) for json_string in cells_json_list]
+
+
+class Notebook(Base):
+    __tablename__ = "notebooks"
+
+    id: Mapped[UUID_t] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid4)
+
+    owner_id: Mapped[UUID_t] = mapped_column(UUID(as_uuid=True), index=True)
+
+    name: Mapped[str] = mapped_column(String, nullable=False)
+
+    cells: Mapped[List[NotebookCell]] = mapped_column(NotebookCellList, nullable=False)
+
+    deletable: Mapped[bool] = mapped_column(Boolean, default=True)
+
+    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
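
The `NotebookCellList` type decorator above stores each cell as a JSON string inside a JSON array and rebuilds `NotebookCell` models on load. A small round-trip sketch of that encode/decode path, reusing `json`, `jsonable_encoder`, and `NotebookCell` from the file (no database involved):

from uuid import uuid4

cell = NotebookCell(id=uuid4(), type="code", name="setup", content="import cognee")

encoded = json.dumps(jsonable_encoder(cell))   # what process_bind_param writes
decoded = NotebookCell(**json.loads(encoded))  # what process_result_value rebuilds

assert decoded == cell  # pydantic coerces the id string back into a UUID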

cognee/modules/notebooks/models/__init__.py
@@ -0,0 +1 @@
+from .Notebook import Notebook, NotebookCell

cognee/modules/notebooks/operations/__init__.py
@@ -0,0 +1 @@
+from .run_in_local_sandbox import run_in_local_sandbox

cognee/modules/notebooks/operations/run_in_local_sandbox.py
@@ -0,0 +1,55 @@
+import io
+import sys
+import traceback
+
+
+def wrap_in_async_handler(user_code: str) -> str:
+    return (
+        "from cognee.infrastructure.utils.run_sync import run_sync\n\n"
+        "async def __user_main__():\n"
+        + "\n".join("    " + line for line in user_code.strip().split("\n"))
+        + "\n"
+        "    globals().update(locals())\n\n"
+        "run_sync(__user_main__())\n"
+    )
+
+
+def run_in_local_sandbox(code, environment=None):
+    environment = environment or {}
+    code = wrap_in_async_handler(code.replace("\xa0", "\n"))
+
+    buffer = io.StringIO()
+    sys_stdout = sys.stdout
+    sys.stdout = buffer
+    sys.stderr = buffer
+
+    error = None
+
+    printOutput = []
+
+    def customPrintFunction(output):
+        printOutput.append(output)
+
+    environment["print"] = customPrintFunction
+
+    try:
+        exec(code, environment)
+    except Exception:
+        error = traceback.format_exc()
+    finally:
+        sys.stdout = sys_stdout
+        sys.stderr = sys_stdout
+
+    return printOutput, error
+
+
+if __name__ == "__main__":
+    run_in_local_sandbox("""
+import cognee
+
+await cognee.add("Test file with some random content 3.")
+
+a = "asd"
+
+b = {"c": "dfgh"}
+""")
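
`run_in_local_sandbox` wraps the supplied source in an async `__user_main__` function (which is why top-level `await` works in the `__main__` demo above), executes it via `run_sync`, and swaps `print` in the exec environment for a collector that appends each value it receives. A usage sketch under that reading; note the injected print accepts a single argument per call:

output, error = run_in_local_sandbox('print("hello")\nprint(1 + 1)')

print(output)  # ["hello", 2] -- the values captured by the injected print
print(error)   # None on success, a formatted traceback string otherwise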

cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py
@@ -1,12 +1,28 @@
 from uuid import UUID
+from typing import Optional, List
+
 from cognee.modules.pipelines.methods import get_pipeline_runs_by_dataset, reset_pipeline_run_status
 from cognee.modules.pipelines.models.PipelineRun import PipelineRunStatus
 from cognee.modules.users.models import User
 
 
-async def reset_dataset_pipeline_run_status(dataset_id: UUID, user: User):
+async def reset_dataset_pipeline_run_status(
+    dataset_id: UUID, user: User, pipeline_names: Optional[list[str]] = None
+):
+    """Reset the status of all (or selected) pipeline runs for a dataset.
+
+    If *pipeline_names* is given, only runs whose *pipeline_name* is in
+    that list are touched.
+    """
     related_pipeline_runs = await get_pipeline_runs_by_dataset(dataset_id)
 
     for pipeline_run in related_pipeline_runs:
-        if pipeline_run.status is not PipelineRunStatus.DATASET_PROCESSING_INITIATED:
-            await reset_pipeline_run_status(user.id, dataset_id, pipeline_run.pipeline_name)
+        # Skip runs that are initiated
+        if pipeline_run.status is PipelineRunStatus.DATASET_PROCESSING_INITIATED:
+            continue
+
+        # If a name filter is provided, skip non-matching runs
+        if pipeline_names is not None and pipeline_run.pipeline_name not in pipeline_names:
+            continue
+
+        await reset_pipeline_run_status(user.id, dataset_id, pipeline_run.pipeline_name)

cognee/modules/pipelines/operations/pipeline.py
@@ -5,6 +5,7 @@ from typing import Union
 from cognee.modules.pipelines.layers.setup_and_check_environment import (
     setup_and_check_environment,
 )
+
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
 from cognee.modules.data.models import Data, Dataset

cognee/modules/pipelines/operations/run_tasks.py
@@ -266,48 +266,24 @@ async def run_tasks(
     if incremental_loading:
         data = await resolve_data_directories(data)
 
-        # # Create async tasks per data item that will run the pipeline for the data item
-        # data_item_tasks = [
-        #     asyncio.create_task(
-        #         _run_tasks_data_item(
-        #             data_item,
-        #             dataset,
-        #             tasks,
-        #             pipeline_name,
-        #             pipeline_id,
-        #             pipeline_run_id,
-        #             context,
-        #             user,
-        #             incremental_loading,
-        #         )
-        #     )
-        #     for data_item in data
-        # ]
-        # results = await asyncio.gather(*data_item_tasks)
-        # # Remove skipped data items from results
-        # results = [result for result in results if result]
-
-        ### TEMP sync data item handling
-        results = []
-        # Run the pipeline for each data_item sequentially, one after the other
-        for data_item in data:
-            result = await _run_tasks_data_item(
-                data_item,
-                dataset,
-                tasks,
-                pipeline_name,
-                pipeline_id,
-                pipeline_run_id,
-                context,
-                user,
-                incremental_loading,
+        # Create async tasks per data item that will run the pipeline for the data item
+        data_item_tasks = [
+            asyncio.create_task(
+                _run_tasks_data_item(
+                    data_item,
+                    dataset,
+                    tasks,
+                    pipeline_name,
+                    pipeline_id,
+                    pipeline_run_id,
+                    context,
+                    user,
+                    incremental_loading,
+                )
             )
-
-
-
-            results.append(result)
-        ### END
+            for data_item in data
+        ]
+        results = await asyncio.gather(*data_item_tasks)
 
         # Remove skipped data items from results
         results = [result for result in results if result]
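
The `run_tasks` hunk above retires the sequential loop marked `### TEMP sync data item handling` and reinstates per-item fan-out. Stripped of pipeline details, the pattern is create_task plus gather, with falsy (skipped) results filtered out afterwards; a self-contained sketch:

import asyncio
from typing import Optional


async def process(item: int) -> Optional[int]:
    # Stand-in for _run_tasks_data_item; None models a skipped data item.
    await asyncio.sleep(0)
    return item * 2 if item else None


async def main():
    items = [0, 1, 2, 3]
    # Schedule every item concurrently; gather preserves input order.
    tasks = [asyncio.create_task(process(item)) for item in items]
    results = await asyncio.gather(*tasks)
    # Remove skipped data items from results, as the pipeline does.
    results = [result for result in results if result]
    print(results)  # [2, 4, 6]


asyncio.run(main())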

cognee/modules/retrieval/base_graph_retriever.py
@@ -0,0 +1,18 @@
+from typing import List, Optional
+from abc import ABC, abstractmethod
+
+from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
+
+
+class BaseGraphRetriever(ABC):
+    """Base class for all graph based retrievers."""
+
+    @abstractmethod
+    async def get_context(self, query: str) -> List[Edge]:
+        """Retrieves triplets based on the query."""
+        pass
+
+    @abstractmethod
+    async def get_completion(self, query: str, context: Optional[List[Edge]] = None) -> str:
+        """Generates a response using the query and optional context (triplets)."""
+        pass
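
The abstract base above fixes the contract for the graph retrievers reworked in this release: `List[Edge]` context in, string completion out. A toy (hypothetical) subclass, reusing the imports from the file, just to show what implementers must provide:

class EchoGraphRetriever(BaseGraphRetriever):
    """Illustrative only: returns no edges and echoes the query."""

    async def get_context(self, query: str) -> List[Edge]:
        return []  # a real retriever would run a graph/vector search here

    async def get_completion(self, query: str, context: Optional[List[Edge]] = None) -> str:
        edges = context if context is not None else await self.get_context(query)
        return f"{len(edges)} edges retrieved for: {query}"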

cognee/modules/retrieval/code_retriever.py
@@ -94,7 +94,15 @@ class CodeRetriever(BaseRetriever):
                    {"id": res.id, "score": res.score, "payload": res.payload}
                )
 
+        existing_collection = []
         for collection in self.classes_and_functions_collections:
+            if await vector_engine.has_collection(collection):
+                existing_collection.append(collection)
+
+        if not existing_collection:
+            raise RuntimeError("No collection found for code retriever")
+
+        for collection in existing_collection:
             logger.debug(f"Searching {collection} collection with general query")
             search_results_code = await vector_engine.search(
                 collection, query, limit=self.top_k

cognee/modules/retrieval/coding_rules_retriever.py
@@ -0,0 +1,31 @@
+import asyncio
+from functools import reduce
+from typing import List, Optional
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.codingagents.coding_rule_associations import get_existing_rules
+
+logger = get_logger("CodingRulesRetriever")
+
+
+class CodingRulesRetriever:
+    """Retriever for handling codeing rule based searches."""
+
+    def __init__(self, rules_nodeset_name: Optional[List[str]] = None):
+        if isinstance(rules_nodeset_name, list):
+            if not rules_nodeset_name:
+                # If there is no provided nodeset set to coding_agent_rules
+                rules_nodeset_name = ["coding_agent_rules"]
+
+        self.rules_nodeset_name = rules_nodeset_name
+        """Initialize retriever with search parameters."""
+
+    async def get_existing_rules(self, query_text):
+        if self.rules_nodeset_name:
+            rules_list = await asyncio.gather(
+                *[
+                    get_existing_rules(rules_nodeset_name=nodeset)
+                    for nodeset in self.rules_nodeset_name
+                ]
+            )
+
+            return reduce(lambda x, y: x + y, rules_list, [])

cognee/modules/retrieval/completion_retriever.py
@@ -23,12 +23,14 @@ class CompletionRetriever(BaseRetriever):
         self,
         user_prompt_path: str = "context_for_question.txt",
         system_prompt_path: str = "answer_simple_question.txt",
+        system_prompt: Optional[str] = None,
         top_k: Optional[int] = 1,
     ):
         """Initialize retriever with optional custom prompt paths."""
         self.user_prompt_path = user_prompt_path
         self.system_prompt_path = system_prompt_path
         self.top_k = top_k if top_k is not None else 1
+        self.system_prompt = system_prompt
 
     async def get_context(self, query: str) -> str:
         """

@@ -65,7 +67,7 @@ class CompletionRetriever(BaseRetriever):
             logger.error("DocumentChunk_text collection not found")
             raise NoDataError("No data found in the system, please add data first.") from error
 
-    async def get_completion(self, query: str, context: Optional[Any] = None) ->
+    async def get_completion(self, query: str, context: Optional[Any] = None) -> str:
         """
         Generates an LLM completion using the context.
 

@@ -88,6 +90,10 @@ class CompletionRetriever(BaseRetriever):
             context = await self.get_context(query)
 
         completion = await generate_completion(
-            query,
+            query=query,
+            context=context,
+            user_prompt_path=self.user_prompt_path,
+            system_prompt_path=self.system_prompt_path,
+            system_prompt=self.system_prompt,
         )
-        return
+        return completion
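
The three `CompletionRetriever` hunks above thread a new literal `system_prompt` through to `generate_completion` alongside the existing `system_prompt_path`. Presumably the literal prompt, when given, takes precedence over the path, though this diff does not show `generate_completion`'s resolution logic. An illustrative call:

import asyncio

from cognee.modules.retrieval.completion_retriever import CompletionRetriever


async def main():
    # Assumes data has already been added and cognified.
    retriever = CompletionRetriever(system_prompt="Answer in one short sentence.")
    print(await retriever.get_completion("What is in my data?"))


asyncio.run(main())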

cognee/modules/retrieval/graph_completion_context_extension_retriever.py
@@ -1,4 +1,5 @@
-from typing import
+from typing import Optional, List, Type
+from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
 from cognee.modules.retrieval.utils.completion import generate_completion

@@ -26,6 +27,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever):
         self,
         user_prompt_path: str = "graph_context_for_question.txt",
         system_prompt_path: str = "answer_simple_question.txt",
+        system_prompt: Optional[str] = None,
         top_k: Optional[int] = 5,
         node_type: Optional[Type] = None,
         node_name: Optional[List[str]] = None,

@@ -38,11 +40,15 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever):
             node_type=node_type,
             node_name=node_name,
             save_interaction=save_interaction,
+            system_prompt=system_prompt,
         )
 
     async def get_completion(
-        self,
-
+        self,
+        query: str,
+        context: Optional[List[Edge]] = None,
+        context_extension_rounds=4,
+    ) -> str:
         """
         Extends the context for a given query by retrieving related triplets and generating new
         completions based on them.

@@ -67,11 +73,12 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever):
             - List[str]: A list containing the generated answer based on the query and the
               extended context.
         """
-        triplets =
+        triplets = context
+
+        if triplets is None:
+            triplets = await self.get_context(query)
 
-
-        triplets += await self.get_triplets(query)
-        context = await self.resolve_edges_to_text(triplets)
+        context_text = await self.resolve_edges_to_text(triplets)
 
         round_idx = 1
 

@@ -83,14 +90,15 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever):
             )
             completion = await generate_completion(
                 query=query,
-                context=
+                context=context_text,
                 user_prompt_path=self.user_prompt_path,
                 system_prompt_path=self.system_prompt_path,
+                system_prompt=self.system_prompt,
             )
 
-            triplets += await self.
+            triplets += await self.get_context(completion)
             triplets = list(set(triplets))
-
+            context_text = await self.resolve_edges_to_text(triplets)
 
             num_triplets = len(triplets)
 

@@ -109,14 +117,15 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever):
 
         completion = await generate_completion(
             query=query,
-            context=
+            context=context_text,
             user_prompt_path=self.user_prompt_path,
             system_prompt_path=self.system_prompt_path,
+            system_prompt=self.system_prompt,
         )
 
-        if self.save_interaction and
+        if self.save_interaction and context_text and triplets and completion:
             await self.save_qa(
-                question=query, answer=completion, context=
+                question=query, answer=completion, context=context_text, triplets=triplets
             )
 
-        return
+        return completion
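
Pieced together from the hunks above, the extension loop now: seeds `triplets` from the passed-in `context` (or `get_context` when none is given), renders them to `context_text`, generates a completion, feeds that completion back into `get_context` to pull in further triplets, dedupes, re-renders, and repeats for up to `context_extension_rounds`. A condensed paraphrase with the retrieval and LLM calls stubbed out as parameters (a simplified sketch, not the shipped code):

async def extend_and_answer(query, fetch_edges, render, complete, rounds=4):
    """Sketch of the context-extension strategy, simplified."""
    triplets = await fetch_edges(query)          # seed context
    context_text = await render(triplets)

    for _ in range(rounds):
        completion = await complete(query, context_text)
        before = len(triplets)
        # Use the draft answer as a new retrieval query, then dedupe.
        triplets = list(set(triplets + await fetch_edges(completion)))
        context_text = await render(triplets)
        if len(triplets) == before:              # nothing new: stop early
            break

    return await complete(query, context_text)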