cognee 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (227)
  1. cognee/__init__.py +1 -0
  2. cognee/api/client.py +9 -5
  3. cognee/api/v1/add/add.py +2 -1
  4. cognee/api/v1/add/routers/get_add_router.py +3 -1
  5. cognee/api/v1/cognify/cognify.py +24 -16
  6. cognee/api/v1/cognify/routers/__init__.py +0 -1
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
  9. cognee/api/v1/ontologies/__init__.py +4 -0
  10. cognee/api/v1/ontologies/ontologies.py +158 -0
  11. cognee/api/v1/ontologies/routers/__init__.py +0 -0
  12. cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
  13. cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
  14. cognee/api/v1/search/search.py +4 -0
  15. cognee/api/v1/ui/node_setup.py +360 -0
  16. cognee/api/v1/ui/npm_utils.py +50 -0
  17. cognee/api/v1/ui/ui.py +38 -68
  18. cognee/cli/commands/cognify_command.py +8 -1
  19. cognee/cli/config.py +1 -1
  20. cognee/context_global_variables.py +86 -9
  21. cognee/eval_framework/Dockerfile +29 -0
  22. cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
  23. cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
  24. cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
  25. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
  26. cognee/eval_framework/eval_config.py +2 -2
  27. cognee/eval_framework/modal_run_eval.py +16 -28
  28. cognee/infrastructure/databases/cache/config.py +3 -1
  29. cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
  30. cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
  31. cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
  32. cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
  33. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
  34. cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
  35. cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
  36. cognee/infrastructure/databases/graph/config.py +7 -0
  37. cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
  38. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
  39. cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
  40. cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
  41. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
  43. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
  44. cognee/infrastructure/databases/utils/__init__.py +3 -0
  45. cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
  46. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
  47. cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
  48. cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
  49. cognee/infrastructure/databases/vector/config.py +5 -0
  50. cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
  51. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
  52. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
  53. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -10
  54. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
  55. cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
  56. cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
  57. cognee/infrastructure/engine/models/Edge.py +13 -1
  58. cognee/infrastructure/files/storage/s3_config.py +2 -0
  59. cognee/infrastructure/files/utils/guess_file_type.py +4 -0
  60. cognee/infrastructure/llm/LLMGateway.py +5 -2
  61. cognee/infrastructure/llm/config.py +37 -0
  62. cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
  71. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
  72. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
  73. cognee/infrastructure/loaders/LoaderEngine.py +1 -0
  74. cognee/infrastructure/loaders/core/__init__.py +2 -1
  75. cognee/infrastructure/loaders/core/csv_loader.py +93 -0
  76. cognee/infrastructure/loaders/core/text_loader.py +1 -2
  77. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
  78. cognee/infrastructure/loaders/supported_loaders.py +2 -1
  79. cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
  80. cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
  81. cognee/modules/chunking/CsvChunker.py +35 -0
  82. cognee/modules/chunking/models/DocumentChunk.py +2 -1
  83. cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
  84. cognee/modules/cognify/config.py +2 -0
  85. cognee/modules/data/deletion/prune_system.py +52 -2
  86. cognee/modules/data/methods/__init__.py +1 -0
  87. cognee/modules/data/methods/create_dataset.py +4 -2
  88. cognee/modules/data/methods/delete_dataset.py +26 -0
  89. cognee/modules/data/methods/get_dataset_ids.py +5 -1
  90. cognee/modules/data/methods/get_unique_data_id.py +68 -0
  91. cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
  92. cognee/modules/data/models/Dataset.py +2 -0
  93. cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
  94. cognee/modules/data/processing/document_types/__init__.py +1 -0
  95. cognee/modules/engine/models/Triplet.py +9 -0
  96. cognee/modules/engine/models/__init__.py +1 -0
  97. cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
  98. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
  99. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
  100. cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
  101. cognee/modules/ingestion/identify.py +4 -4
  102. cognee/modules/memify/memify.py +1 -7
  103. cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
  104. cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
  105. cognee/modules/pipelines/operations/pipeline.py +18 -2
  106. cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
  107. cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
  108. cognee/modules/retrieval/__init__.py +1 -1
  109. cognee/modules/retrieval/base_graph_retriever.py +7 -3
  110. cognee/modules/retrieval/base_retriever.py +7 -3
  111. cognee/modules/retrieval/completion_retriever.py +11 -4
  112. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
  113. cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
  114. cognee/modules/retrieval/graph_completion_retriever.py +14 -1
  115. cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
  116. cognee/modules/retrieval/register_retriever.py +10 -0
  117. cognee/modules/retrieval/registered_community_retrievers.py +1 -0
  118. cognee/modules/retrieval/temporal_retriever.py +13 -2
  119. cognee/modules/retrieval/triplet_retriever.py +182 -0
  120. cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
  121. cognee/modules/retrieval/utils/completion.py +2 -22
  122. cognee/modules/run_custom_pipeline/__init__.py +1 -0
  123. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
  124. cognee/modules/search/methods/get_search_type_tools.py +54 -8
  125. cognee/modules/search/methods/no_access_control_search.py +4 -0
  126. cognee/modules/search/methods/search.py +26 -3
  127. cognee/modules/search/types/SearchType.py +1 -1
  128. cognee/modules/settings/get_settings.py +19 -0
  129. cognee/modules/users/methods/create_user.py +12 -27
  130. cognee/modules/users/methods/get_authenticated_user.py +3 -2
  131. cognee/modules/users/methods/get_default_user.py +4 -2
  132. cognee/modules/users/methods/get_user.py +1 -1
  133. cognee/modules/users/methods/get_user_by_email.py +1 -1
  134. cognee/modules/users/models/DatasetDatabase.py +24 -3
  135. cognee/modules/users/models/Tenant.py +6 -7
  136. cognee/modules/users/models/User.py +6 -5
  137. cognee/modules/users/models/UserTenant.py +12 -0
  138. cognee/modules/users/models/__init__.py +1 -0
  139. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
  140. cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
  141. cognee/modules/users/tenants/methods/__init__.py +1 -0
  142. cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
  143. cognee/modules/users/tenants/methods/create_tenant.py +22 -8
  144. cognee/modules/users/tenants/methods/select_tenant.py +62 -0
  145. cognee/shared/logging_utils.py +6 -0
  146. cognee/shared/rate_limiting.py +30 -0
  147. cognee/tasks/chunks/__init__.py +1 -0
  148. cognee/tasks/chunks/chunk_by_row.py +94 -0
  149. cognee/tasks/documents/__init__.py +0 -1
  150. cognee/tasks/documents/classify_documents.py +2 -0
  151. cognee/tasks/feedback/generate_improved_answers.py +3 -3
  152. cognee/tasks/graph/extract_graph_from_data.py +9 -10
  153. cognee/tasks/ingestion/ingest_data.py +1 -1
  154. cognee/tasks/memify/__init__.py +2 -0
  155. cognee/tasks/memify/cognify_session.py +41 -0
  156. cognee/tasks/memify/extract_user_sessions.py +73 -0
  157. cognee/tasks/memify/get_triplet_datapoints.py +289 -0
  158. cognee/tasks/storage/add_data_points.py +142 -2
  159. cognee/tasks/storage/index_data_points.py +33 -22
  160. cognee/tasks/storage/index_graph_edges.py +37 -57
  161. cognee/tests/integration/documents/CsvDocument_test.py +70 -0
  162. cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
  163. cognee/tests/integration/tasks/test_add_data_points.py +139 -0
  164. cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
  165. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +1 -1
  166. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +1 -1
  167. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +13 -27
  168. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
  169. cognee/tests/test_add_docling_document.py +2 -2
  170. cognee/tests/test_cognee_server_start.py +84 -3
  171. cognee/tests/test_conversation_history.py +68 -5
  172. cognee/tests/test_data/example_with_header.csv +3 -0
  173. cognee/tests/test_dataset_database_handler.py +137 -0
  174. cognee/tests/test_dataset_delete.py +76 -0
  175. cognee/tests/test_edge_centered_payload.py +170 -0
  176. cognee/tests/test_edge_ingestion.py +27 -0
  177. cognee/tests/test_feedback_enrichment.py +1 -1
  178. cognee/tests/test_library.py +6 -4
  179. cognee/tests/test_load.py +62 -0
  180. cognee/tests/test_multi_tenancy.py +165 -0
  181. cognee/tests/test_parallel_databases.py +2 -0
  182. cognee/tests/test_pipeline_cache.py +164 -0
  183. cognee/tests/test_relational_db_migration.py +54 -2
  184. cognee/tests/test_search_db.py +44 -2
  185. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
  186. cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
  187. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
  188. cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
  189. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
  190. cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
  191. cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
  192. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
  193. cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
  194. cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
  195. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
  196. cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
  197. cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
  198. cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
  199. cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
  200. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
  201. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
  202. cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
  203. cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
  204. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
  205. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
  206. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
  207. cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
  208. cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
  209. cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
  210. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -6
  211. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/RECORD +215 -163
  212. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/WHEEL +1 -1
  213. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
  214. cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
  215. cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
  216. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
  217. cognee/modules/retrieval/code_retriever.py +0 -232
  218. cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
  219. cognee/tasks/code/get_local_dependencies_checker.py +0 -20
  220. cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
  221. cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
  222. cognee/tasks/repo_processor/__init__.py +0 -2
  223. cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
  224. cognee/tasks/repo_processor/get_non_code_files.py +0 -158
  225. cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
  226. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
  227. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0

cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
@@ -0,0 +1,55 @@
+ from typing import Optional, List
+
+ from cognee import memify
+ from cognee.context_global_variables import (
+     set_database_global_context_variables,
+     set_session_user_context_variable,
+ )
+ from cognee.exceptions import CogneeValidationError
+ from cognee.modules.data.methods import get_authorized_existing_datasets
+ from cognee.shared.logging_utils import get_logger
+ from cognee.modules.pipelines.tasks.task import Task
+ from cognee.modules.users.models import User
+ from cognee.tasks.memify import extract_user_sessions, cognify_session
+
+
+ logger = get_logger("persist_sessions_in_knowledge_graph")
+
+
+ async def persist_sessions_in_knowledge_graph_pipeline(
+     user: User,
+     session_ids: Optional[List[str]] = None,
+     dataset: str = "main_dataset",
+     run_in_background: bool = False,
+ ):
+     await set_session_user_context_variable(user)
+     dataset_to_write = await get_authorized_existing_datasets(
+         user=user, datasets=[dataset], permission_type="write"
+     )
+
+     if not dataset_to_write:
+         raise CogneeValidationError(
+             message=f"User (id: {str(user.id)}) does not have write access to dataset: {dataset}",
+             log=False,
+         )
+
+     await set_database_global_context_variables(
+         dataset_to_write[0].id, dataset_to_write[0].owner_id
+     )
+
+     extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)]
+
+     enrichment_tasks = [
+         Task(cognify_session, dataset_id=dataset_to_write[0].id),
+     ]
+
+     result = await memify(
+         extraction_tasks=extraction_tasks,
+         enrichment_tasks=enrichment_tasks,
+         dataset=dataset_to_write[0].id,
+         data=[{}],
+         run_in_background=run_in_background,
+     )
+
+     logger.info("Session persistence pipeline completed")
+     return result
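
A minimal usage sketch for the new pipeline above. The module path follows the file list (cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py); get_default_user is an existing cognee helper, and the argument values are illustrative:

    import asyncio

    from cognee.memify_pipelines.persist_sessions_in_knowledge_graph import (
        persist_sessions_in_knowledge_graph_pipeline,
    )
    from cognee.modules.users.methods import get_default_user


    async def main():
        # Any User with write access to the target dataset works here.
        user = await get_default_user()

        # session_ids=None persists all available sessions into "main_dataset".
        await persist_sessions_in_knowledge_graph_pipeline(
            user=user,
            session_ids=None,
            dataset="main_dataset",
            run_in_background=False,
        )


    asyncio.run(main())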

cognee/modules/chunking/CsvChunker.py
@@ -0,0 +1,35 @@
+ from cognee.shared.logging_utils import get_logger
+
+
+ from cognee.tasks.chunks import chunk_by_row
+ from cognee.modules.chunking.Chunker import Chunker
+ from .models.DocumentChunk import DocumentChunk
+
+ logger = get_logger()
+
+
+ class CsvChunker(Chunker):
+     async def read(self):
+         async for content_text in self.get_text():
+             if content_text is None:
+                 continue
+
+             for chunk_data in chunk_by_row(content_text, self.max_chunk_size):
+                 if chunk_data["chunk_size"] <= self.max_chunk_size:
+                     yield DocumentChunk(
+                         id=chunk_data["chunk_id"],
+                         text=chunk_data["text"],
+                         chunk_size=chunk_data["chunk_size"],
+                         is_part_of=self.document,
+                         chunk_index=self.chunk_index,
+                         cut_type=chunk_data["cut_type"],
+                         contains=[],
+                         metadata={
+                             "index_fields": ["text"],
+                         },
+                     )
+                     self.chunk_index += 1
+                 else:
+                     raise ValueError(
+                         f"Chunk size is larger than the maximum chunk size {self.max_chunk_size}"
+                     )
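
CsvChunker delegates row splitting to the new chunk_by_row helper (cognee/tasks/chunks/chunk_by_row.py, +94 lines, not shown in this excerpt). A stand-in sketch of the contract the chunker consumes — dicts carrying chunk_id, text, chunk_size, and cut_type; the sizing and id logic below are assumptions, not the shipped implementation:

    from uuid import NAMESPACE_OID, uuid5


    def chunk_by_row_stub(content_text: str, max_chunk_size: int):
        """Yield per-row chunk dicts with the keys CsvChunker.read() expects."""
        for index, row in enumerate(content_text.splitlines()):
            yield {
                "chunk_id": uuid5(NAMESPACE_OID, f"{index}-{row}"),  # assumed id scheme
                "text": row,
                "chunk_size": len(row),  # assumption: size measured in characters
                "cut_type": "row_end",  # assumption: placeholder cut type
            }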

cognee/modules/chunking/models/DocumentChunk.py
@@ -1,6 +1,7 @@
  from typing import List, Union

  from cognee.infrastructure.engine import DataPoint
+ from cognee.infrastructure.engine.models.Edge import Edge
  from cognee.modules.data.processing.document_types import Document
  from cognee.modules.engine.models import Entity
  from cognee.tasks.temporal_graph.models import Event
@@ -31,6 +32,6 @@ class DocumentChunk(DataPoint):
      chunk_index: int
      cut_type: str
      is_part_of: Document
-     contains: List[Union[Entity, Event]] = None
+     contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None

      metadata: dict = {"index_fields": ["text"]}

cognee/modules/chunking/text_chunker_with_overlap.py
@@ -0,0 +1,124 @@
+ from cognee.shared.logging_utils import get_logger
+ from uuid import NAMESPACE_OID, uuid5
+
+ from cognee.tasks.chunks import chunk_by_paragraph
+ from cognee.modules.chunking.Chunker import Chunker
+ from .models.DocumentChunk import DocumentChunk
+
+ logger = get_logger()
+
+
+ class TextChunkerWithOverlap(Chunker):
+     def __init__(
+         self,
+         document,
+         get_text: callable,
+         max_chunk_size: int,
+         chunk_overlap_ratio: float = 0.0,
+         get_chunk_data: callable = None,
+     ):
+         super().__init__(document, get_text, max_chunk_size)
+         self._accumulated_chunk_data = []
+         self._accumulated_size = 0
+         self.chunk_overlap_ratio = chunk_overlap_ratio
+         self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)
+
+         if get_chunk_data is not None:
+             self.get_chunk_data = get_chunk_data
+         elif chunk_overlap_ratio > 0:
+             paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)
+             self.get_chunk_data = lambda text: chunk_by_paragraph(
+                 text, paragraph_max_size, batch_paragraphs=True
+             )
+         else:
+             self.get_chunk_data = lambda text: chunk_by_paragraph(
+                 text, self.max_chunk_size, batch_paragraphs=True
+             )
+
+     def _accumulation_overflows(self, chunk_data):
+         """Check if adding chunk_data would exceed max_chunk_size."""
+         return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size
+
+     def _accumulate_chunk_data(self, chunk_data):
+         """Add chunk_data to the current accumulation."""
+         self._accumulated_chunk_data.append(chunk_data)
+         self._accumulated_size += chunk_data["chunk_size"]
+
+     def _clear_accumulation(self):
+         """Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio."""
+         if self.chunk_overlap == 0:
+             self._accumulated_chunk_data = []
+             self._accumulated_size = 0
+             return
+
+         # Keep chunk_data from the end that fit in overlap
+         overlap_chunk_data = []
+         overlap_size = 0
+
+         for chunk_data in reversed(self._accumulated_chunk_data):
+             if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap:
+                 overlap_chunk_data.insert(0, chunk_data)
+                 overlap_size += chunk_data["chunk_size"]
+             else:
+                 break
+
+         self._accumulated_chunk_data = overlap_chunk_data
+         self._accumulated_size = overlap_size
+
+     def _create_chunk(self, text, size, cut_type, chunk_id=None):
+         """Create a DocumentChunk with standard metadata."""
+         try:
+             return DocumentChunk(
+                 id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
+                 text=text,
+                 chunk_size=size,
+                 is_part_of=self.document,
+                 chunk_index=self.chunk_index,
+                 cut_type=cut_type,
+                 contains=[],
+                 metadata={"index_fields": ["text"]},
+             )
+         except Exception as e:
+             logger.error(e)
+             raise e
+
+     def _create_chunk_from_accumulation(self):
+         """Create a DocumentChunk from current accumulated chunk_data."""
+         chunk_text = " ".join(chunk["text"] for chunk in self._accumulated_chunk_data)
+         return self._create_chunk(
+             text=chunk_text,
+             size=self._accumulated_size,
+             cut_type=self._accumulated_chunk_data[-1]["cut_type"],
+         )
+
+     def _emit_chunk(self, chunk_data):
+         """Emit a chunk when accumulation overflows."""
+         if len(self._accumulated_chunk_data) > 0:
+             chunk = self._create_chunk_from_accumulation()
+             self._clear_accumulation()
+             self._accumulate_chunk_data(chunk_data)
+         else:
+             # Handle single chunk_data exceeding max_chunk_size
+             chunk = self._create_chunk(
+                 text=chunk_data["text"],
+                 size=chunk_data["chunk_size"],
+                 cut_type=chunk_data["cut_type"],
+                 chunk_id=chunk_data["chunk_id"],
+             )
+
+         self.chunk_index += 1
+         return chunk
+
+     async def read(self):
+         async for content_text in self.get_text():
+             for chunk_data in self.get_chunk_data(content_text):
+                 if not self._accumulation_overflows(chunk_data):
+                     self._accumulate_chunk_data(chunk_data)
+                     continue
+
+                 yield self._emit_chunk(chunk_data)
+
+         if len(self._accumulated_chunk_data) == 0:
+             return
+
+         yield self._create_chunk_from_accumulation()
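
The overlap arithmetic in TextChunkerWithOverlap.__init__ is worth tracing once with concrete numbers (the values below are illustrative, not defaults):

    max_chunk_size = 1024
    chunk_overlap_ratio = 0.2

    # Tail of each emitted chunk that is carried over into the next chunk.
    chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)

    # With overlap enabled and no custom get_chunk_data, paragraphs are
    # pre-split to half the overlap budget so several whole paragraphs
    # can fit in the carried-over tail.
    paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)

    assert chunk_overlap == 204
    assert paragraph_max_size == 102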

cognee/modules/cognify/config.py
@@ -8,12 +8,14 @@ import os
  class CognifyConfig(BaseSettings):
      classification_model: object = DefaultContentPrediction
      summarization_model: object = SummarizedContent
+     triplet_embedding: bool = False
      model_config = SettingsConfigDict(env_file=".env", extra="allow")

      def to_dict(self) -> dict:
          return {
              "classification_model": self.classification_model,
              "summarization_model": self.summarization_model,
+             "triplet_embedding": self.triplet_embedding,
          }

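
Since CognifyConfig is a pydantic BaseSettings subclass (env_file=".env", extra="allow"), the new flag should also be controllable from the environment. A sketch, assuming CognifyConfig can be instantiated directly (cognee typically exposes it via a get_cognify_config helper):

    import os

    # Field names map to environment variables case-insensitively in pydantic-settings.
    os.environ["TRIPLET_EMBEDDING"] = "true"

    from cognee.modules.cognify.config import CognifyConfig

    config = CognifyConfig()
    assert config.triplet_embedding is True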

cognee/modules/data/deletion/prune_system.py
@@ -1,17 +1,67 @@
+ from sqlalchemy.exc import OperationalError
+
+ from cognee.infrastructure.databases.exceptions import EntityNotFoundError
+ from cognee.context_global_variables import backend_access_control_enabled
  from cognee.infrastructure.databases.vector import get_vector_engine
  from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
  from cognee.infrastructure.databases.relational import get_relational_engine
+ from cognee.infrastructure.databases.utils import (
+     get_graph_dataset_database_handler,
+     get_vector_dataset_database_handler,
+ )
  from cognee.shared.cache import delete_cache
+ from cognee.modules.users.models import DatasetDatabase
+ from cognee.shared.logging_utils import get_logger
+
+ logger = get_logger()
+
+
+ async def prune_graph_databases():
+     db_engine = get_relational_engine()
+     try:
+         dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
+         # Go through each dataset database and delete the graph database
+         for dataset_database in dataset_databases:
+             handler = get_graph_dataset_database_handler(dataset_database)
+             await handler["handler_instance"].delete_dataset(dataset_database)
+     except (OperationalError, EntityNotFoundError) as e:
+         logger.debug(
+             "Skipping pruning of graph DB. Error when accessing dataset_database table: %s",
+             e,
+         )
+         return
+
+
+ async def prune_vector_databases():
+     db_engine = get_relational_engine()
+     try:
+         dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
+         # Go through each dataset database and delete the vector database
+         for dataset_database in dataset_databases:
+             handler = get_vector_dataset_database_handler(dataset_database)
+             await handler["handler_instance"].delete_dataset(dataset_database)
+     except (OperationalError, EntityNotFoundError) as e:
+         logger.debug(
+             "Skipping pruning of vector DB. Error when accessing dataset_database table: %s",
+             e,
+         )
+         return


  async def prune_system(graph=True, vector=True, metadata=True, cache=True):
-     if graph:
+     # Note: prune system should not be available through the API, it has no permission checks and will
+     # delete all graph and vector databases if called. It should only be used in development or testing environments.
+     if graph and not backend_access_control_enabled():
          graph_engine = await get_graph_engine()
          await graph_engine.delete_graph()
+     elif graph and backend_access_control_enabled():
+         await prune_graph_databases()

-     if vector:
+     if vector and not backend_access_control_enabled():
          vector_engine = get_vector_engine()
          await vector_engine.prune()
+     elif vector and backend_access_control_enabled():
+         await prune_vector_databases()

      if metadata:
          db_engine = get_relational_engine()
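
The new note in prune_system is worth heeding: with backend access control enabled it now iterates the dataset_database table and deletes every per-dataset graph and vector store. A sketch of the intended development/testing-only usage, assuming the long-standing cognee.prune entry points:

    import asyncio

    import cognee


    async def reset_local_stores():
        # Development and testing only: prune has no permission checks.
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(graph=True, vector=True, metadata=True, cache=True)


    asyncio.run(reset_local_stores())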

cognee/modules/data/methods/__init__.py
@@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
  from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
  from .get_data import get_data
  from .get_unique_dataset_id import get_unique_dataset_id
+ from .get_unique_data_id import get_unique_data_id
  from .get_authorized_existing_datasets import get_authorized_existing_datasets
  from .get_dataset_ids import get_dataset_ids


cognee/modules/data/methods/create_dataset.py
@@ -16,14 +16,16 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) -
              .options(joinedload(Dataset.data))
              .filter(Dataset.name == dataset_name)
              .filter(Dataset.owner_id == owner_id)
+             .filter(Dataset.tenant_id == user.tenant_id)
          )
      ).first()

      if dataset is None:
          # Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name
          dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user)
-         dataset = Dataset(id=dataset_id, name=dataset_name, data=[])
-         dataset.owner_id = owner_id
+         dataset = Dataset(
+             id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id
+         )

          session.add(dataset)

cognee/modules/data/methods/delete_dataset.py
@@ -1,8 +1,34 @@
+ from cognee.modules.users.models import DatasetDatabase
+ from sqlalchemy import select
+
  from cognee.modules.data.models import Dataset
+ from cognee.infrastructure.databases.utils.get_vector_dataset_database_handler import (
+     get_vector_dataset_database_handler,
+ )
+ from cognee.infrastructure.databases.utils.get_graph_dataset_database_handler import (
+     get_graph_dataset_database_handler,
+ )
  from cognee.infrastructure.databases.relational import get_relational_engine


  async def delete_dataset(dataset: Dataset):
      db_engine = get_relational_engine()

+     async with db_engine.get_async_session() as session:
+         stmt = select(DatasetDatabase).where(
+             DatasetDatabase.dataset_id == dataset.id,
+         )
+         dataset_database: DatasetDatabase = await session.scalar(stmt)
+         if dataset_database:
+             graph_dataset_database_handler = get_graph_dataset_database_handler(dataset_database)
+             vector_dataset_database_handler = get_vector_dataset_database_handler(dataset_database)
+             await graph_dataset_database_handler["handler_instance"].delete_dataset(
+                 dataset_database
+             )
+             await vector_dataset_database_handler["handler_instance"].delete_dataset(
+                 dataset_database
+             )
+     # TODO: Remove dataset from pipeline_run_status in Data objects related to dataset as well
+     # This blocks recreation of the dataset with the same name and data after deletion as
+     # it's marked as completed and will be just skipped even though it's empty.
      return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id)

cognee/modules/data/methods/get_dataset_ids.py
@@ -27,7 +27,11 @@ async def get_dataset_ids(datasets: Union[list[str], list[UUID]], user):
          # Get all user owned dataset objects (If a user wants to write to a dataset he is not the owner of it must be provided through UUID.)
          user_datasets = await get_datasets(user.id)
          # Filter out non name mentioned datasets
-         dataset_ids = [dataset.id for dataset in user_datasets if dataset.name in datasets]
+         dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets]
+         # Filter out non current tenant datasets
+         dataset_ids = [
+             dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id
+         ]
      else:
          raise DatasetTypeError(
              f"One or more of the provided dataset types is not handled: f{datasets}"

cognee/modules/data/methods/get_unique_data_id.py
@@ -0,0 +1,68 @@
+ from uuid import uuid5, NAMESPACE_OID, UUID
+ from sqlalchemy import select
+
+ from cognee.modules.data.models.Data import Data
+ from cognee.infrastructure.databases.relational import get_relational_engine
+ from cognee.modules.users.models import User
+
+
+ async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
+     """
+     Function returns a unique UUID for data based on data identifier, user id and tenant id.
+     If data with legacy ID exists, return that ID to maintain compatibility.
+
+     Args:
+         data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+         user: User object adding the data
+         tenant_id: UUID of the tenant for which data is being added
+
+     Returns:
+         UUID: Unique identifier for the data
+     """
+
+     def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID:
+         """
+         Deprecated function, returns a unique UUID for data based on data identifier and user id.
+         Needed to support legacy data without tenant information.
+         Args:
+             data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+             user: User object adding the data
+
+         Returns:
+             UUID: Unique identifier for the data
+         """
+         # return UUID hash of file contents + owner id + tenant_id
+         return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
+
+     def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
+         """
+         Function returns a unique UUID for data based on data identifier, user id and tenant id.
+         Args:
+             data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+             user: User object adding the data
+             tenant_id: UUID of the tenant for which data is being added
+
+         Returns:
+             UUID: Unique identifier for the data
+         """
+         # return UUID hash of file contents + owner id + tenant_id
+         return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
+
+     # Get all possible data_id values
+     data_id = {
+         "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
+         "legacy_data_id": _get_deprecated_unique_data_id(
+             data_identifier=data_identifier, user=user
+         ),
+     }
+
+     # Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id
+     db_engine = get_relational_engine()
+     async with db_engine.get_async_session() as session:
+         legacy_data_point = (
+             await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"]))
+         ).scalar_one_or_none()
+
+     if not legacy_data_point:
+         return data_id["modern_data_id"]
+     return data_id["legacy_data_id"]

cognee/modules/data/methods/get_unique_dataset_id.py
@@ -1,9 +1,71 @@
  from uuid import UUID, uuid5, NAMESPACE_OID
- from cognee.modules.users.models import User
  from typing import Union
+ from sqlalchemy import select
+
+ from cognee.modules.data.models.Dataset import Dataset
+ from cognee.modules.users.models import User
+ from cognee.infrastructure.databases.relational import get_relational_engine


  async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
-     if isinstance(dataset_name, UUID):
-         return dataset_name
-     return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+     """
+     Function returns a unique UUID for dataset based on dataset name, user id and tenant id.
+     If dataset with legacy ID exists, return that ID to maintain compatibility.
+
+     Args:
+         dataset_name: string representing the dataset name
+         user: User object adding the dataset
+         tenant_id: UUID of the tenant for which dataset is being added
+
+     Returns:
+         UUID: Unique identifier for the dataset
+     """
+
+     def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+         """
+         Legacy function, returns a unique UUID for dataset based on dataset name and user id.
+         Needed to support legacy datasets without tenant information.
+         Args:
+             dataset_name: string representing the dataset name
+             user: Current User object adding the dataset
+
+         Returns:
+             UUID: Unique identifier for the dataset
+         """
+         if isinstance(dataset_name, UUID):
+             return dataset_name
+         return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+
+     def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+         """
+         Returns a unique UUID for dataset based on dataset name, user id and tenant_id.
+         Args:
+             dataset_name: string representing the dataset name
+             user: Current User object adding the dataset
+             tenant_id: UUID of the tenant for which dataset is being added
+
+         Returns:
+             UUID: Unique identifier for the dataset
+         """
+         if isinstance(dataset_name, UUID):
+             return dataset_name
+         return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}")
+
+     # Get all possible dataset_id values
+     dataset_id = {
+         "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user),
+         "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user),
+     }
+
+     # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id
+     db_engine = get_relational_engine()
+     async with db_engine.get_async_session() as session:
+         legacy_dataset = (
+             await session.execute(
+                 select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"])
+             )
+         ).scalar_one_or_none()
+
+     if not legacy_dataset:
+         return dataset_id["modern_dataset_id"]
+     return dataset_id["legacy_dataset_id"]
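
Both ID helpers follow the same deterministic uuid5 scheme; the tenant-aware "modern" ID simply mixes user.tenant_id into the hashed name. A self-contained sketch with placeholder IDs:

    from uuid import NAMESPACE_OID, UUID, uuid5

    # Placeholder ids; real values come from the User record.
    user_id = UUID("11111111-1111-1111-1111-111111111111")
    tenant_id = UUID("22222222-2222-2222-2222-222222222222")
    dataset_name = "main_dataset"

    # Legacy id: derived from the dataset name and owner only.
    legacy_id = uuid5(NAMESPACE_OID, f"{dataset_name}{user_id}")

    # Modern id: also mixes in the tenant, so the same dataset name is
    # distinct per tenant while staying deterministic.
    modern_id = uuid5(NAMESPACE_OID, f"{dataset_name}{user_id}{tenant_id}")

    assert legacy_id != modern_id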

cognee/modules/data/models/Dataset.py
@@ -18,6 +18,7 @@ class Dataset(Base):
      updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

      owner_id = Column(UUID, index=True)
+     tenant_id = Column(UUID, index=True, nullable=True)

      acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan")

@@ -36,5 +37,6 @@ class Dataset(Base):
          "createdAt": self.created_at.isoformat(),
          "updatedAt": self.updated_at.isoformat() if self.updated_at else None,
          "ownerId": str(self.owner_id),
+         "tenantId": str(self.tenant_id),
          "data": [data.to_json() for data in self.data],
      }

cognee/modules/data/processing/document_types/CsvDocument.py
@@ -0,0 +1,33 @@
+ import io
+ import csv
+ from typing import Type
+
+ from cognee.modules.chunking.Chunker import Chunker
+ from cognee.infrastructure.files.utils.open_data_file import open_data_file
+ from .Document import Document
+
+
+ class CsvDocument(Document):
+     type: str = "csv"
+     mime_type: str = "text/csv"
+
+     async def read(self, chunker_cls: Type[Chunker], max_chunk_size: int):
+         async def get_text():
+             async with open_data_file(
+                 self.raw_data_location, mode="r", encoding="utf-8", newline=""
+             ) as file:
+                 content = file.read()
+                 file_like_obj = io.StringIO(content)
+                 reader = csv.DictReader(file_like_obj)
+
+                 for row in reader:
+                     pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()]
+                     row_text = ", ".join(pairs)
+                     if not row_text.strip():
+                         break
+                     yield row_text
+
+         chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text)
+
+         async for chunk in chunker.read():
+             yield chunk
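
The row-to-text rendering in CsvDocument.read() can be reproduced with the standard library alone; each row becomes one "key: value" line that is then handed to the chunker:

    import csv
    import io

    sample = "name,role\nAda,engineer\nGrace,admiral\n"

    for row in csv.DictReader(io.StringIO(sample)):
        print(", ".join(f"{k}: {v}" for k, v in row.items()))
    # name: Ada, role: engineer
    # name: Grace, role: admiral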

cognee/modules/data/processing/document_types/__init__.py
@@ -4,3 +4,4 @@ from .TextDocument import TextDocument
  from .ImageDocument import ImageDocument
  from .AudioDocument import AudioDocument
  from .UnstructuredDocument import UnstructuredDocument
+ from .CsvDocument import CsvDocument

cognee/modules/engine/models/Triplet.py
@@ -0,0 +1,9 @@
+ from cognee.infrastructure.engine import DataPoint
+
+
+ class Triplet(DataPoint):
+     text: str
+     from_node_id: str
+     to_node_id: str
+
+     metadata: dict = {"index_fields": ["text"]}
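
Constructing the new Triplet data point is straightforward; a sketch, assuming DataPoint supplies defaults for its base fields (id, timestamps) and that the model is re-exported from cognee.modules.engine.models, as the __init__ diff below indicates:

    from cognee.modules.engine.models import Triplet

    # "text" is the embedded/indexed field per the model's metadata; the node
    # ids tie the embedding back to the graph edge it describes.
    triplet = Triplet(
        text="Ada Lovelace -- wrote notes on -- Analytical Engine",
        from_node_id="ada-lovelace",
        to_node_id="analytical-engine",
    )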

cognee/modules/engine/models/__init__.py
@@ -7,3 +7,4 @@ from .ColumnValue import ColumnValue
  from .Timestamp import Timestamp
  from .Interval import Interval
  from .Event import Event
+ from .Triplet import Triplet