cognee 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/client.py +9 -5
- cognee/api/v1/add/add.py +2 -1
- cognee/api/v1/add/routers/get_add_router.py +3 -1
- cognee/api/v1/cognify/cognify.py +24 -16
- cognee/api/v1/cognify/routers/__init__.py +0 -1
- cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
- cognee/api/v1/ontologies/__init__.py +4 -0
- cognee/api/v1/ontologies/ontologies.py +158 -0
- cognee/api/v1/ontologies/routers/__init__.py +0 -0
- cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
- cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
- cognee/api/v1/search/search.py +4 -0
- cognee/api/v1/ui/node_setup.py +360 -0
- cognee/api/v1/ui/npm_utils.py +50 -0
- cognee/api/v1/ui/ui.py +38 -68
- cognee/cli/commands/cognify_command.py +8 -1
- cognee/cli/config.py +1 -1
- cognee/context_global_variables.py +86 -9
- cognee/eval_framework/Dockerfile +29 -0
- cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
- cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
- cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
- cognee/eval_framework/eval_config.py +2 -2
- cognee/eval_framework/modal_run_eval.py +16 -28
- cognee/infrastructure/databases/cache/config.py +3 -1
- cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
- cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
- cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
- cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
- cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
- cognee/infrastructure/databases/graph/config.py +7 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
- cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
- cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
- cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
- cognee/infrastructure/databases/utils/__init__.py +3 -0
- cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
- cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
- cognee/infrastructure/databases/vector/config.py +5 -0
- cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -10
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
- cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
- cognee/infrastructure/engine/models/Edge.py +13 -1
- cognee/infrastructure/files/storage/s3_config.py +2 -0
- cognee/infrastructure/files/utils/guess_file_type.py +4 -0
- cognee/infrastructure/llm/LLMGateway.py +5 -2
- cognee/infrastructure/llm/config.py +37 -0
- cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
- cognee/infrastructure/loaders/LoaderEngine.py +1 -0
- cognee/infrastructure/loaders/core/__init__.py +2 -1
- cognee/infrastructure/loaders/core/csv_loader.py +93 -0
- cognee/infrastructure/loaders/core/text_loader.py +1 -2
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
- cognee/infrastructure/loaders/supported_loaders.py +2 -1
- cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
- cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
- cognee/modules/chunking/CsvChunker.py +35 -0
- cognee/modules/chunking/models/DocumentChunk.py +2 -1
- cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
- cognee/modules/cognify/config.py +2 -0
- cognee/modules/data/deletion/prune_system.py +52 -2
- cognee/modules/data/methods/__init__.py +1 -0
- cognee/modules/data/methods/create_dataset.py +4 -2
- cognee/modules/data/methods/delete_dataset.py +26 -0
- cognee/modules/data/methods/get_dataset_ids.py +5 -1
- cognee/modules/data/methods/get_unique_data_id.py +68 -0
- cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
- cognee/modules/data/models/Dataset.py +2 -0
- cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
- cognee/modules/data/processing/document_types/__init__.py +1 -0
- cognee/modules/engine/models/Triplet.py +9 -0
- cognee/modules/engine/models/__init__.py +1 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
- cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
- cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
- cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
- cognee/modules/ingestion/identify.py +4 -4
- cognee/modules/memify/memify.py +1 -7
- cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
- cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
- cognee/modules/pipelines/operations/pipeline.py +18 -2
- cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
- cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
- cognee/modules/retrieval/__init__.py +1 -1
- cognee/modules/retrieval/base_graph_retriever.py +7 -3
- cognee/modules/retrieval/base_retriever.py +7 -3
- cognee/modules/retrieval/completion_retriever.py +11 -4
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
- cognee/modules/retrieval/graph_completion_retriever.py +14 -1
- cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
- cognee/modules/retrieval/register_retriever.py +10 -0
- cognee/modules/retrieval/registered_community_retrievers.py +1 -0
- cognee/modules/retrieval/temporal_retriever.py +13 -2
- cognee/modules/retrieval/triplet_retriever.py +182 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
- cognee/modules/retrieval/utils/completion.py +2 -22
- cognee/modules/run_custom_pipeline/__init__.py +1 -0
- cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
- cognee/modules/search/methods/get_search_type_tools.py +54 -8
- cognee/modules/search/methods/no_access_control_search.py +4 -0
- cognee/modules/search/methods/search.py +26 -3
- cognee/modules/search/types/SearchType.py +1 -1
- cognee/modules/settings/get_settings.py +19 -0
- cognee/modules/users/methods/create_user.py +12 -27
- cognee/modules/users/methods/get_authenticated_user.py +3 -2
- cognee/modules/users/methods/get_default_user.py +4 -2
- cognee/modules/users/methods/get_user.py +1 -1
- cognee/modules/users/methods/get_user_by_email.py +1 -1
- cognee/modules/users/models/DatasetDatabase.py +24 -3
- cognee/modules/users/models/Tenant.py +6 -7
- cognee/modules/users/models/User.py +6 -5
- cognee/modules/users/models/UserTenant.py +12 -0
- cognee/modules/users/models/__init__.py +1 -0
- cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
- cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
- cognee/modules/users/tenants/methods/__init__.py +1 -0
- cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
- cognee/modules/users/tenants/methods/create_tenant.py +22 -8
- cognee/modules/users/tenants/methods/select_tenant.py +62 -0
- cognee/shared/logging_utils.py +6 -0
- cognee/shared/rate_limiting.py +30 -0
- cognee/tasks/chunks/__init__.py +1 -0
- cognee/tasks/chunks/chunk_by_row.py +94 -0
- cognee/tasks/documents/__init__.py +0 -1
- cognee/tasks/documents/classify_documents.py +2 -0
- cognee/tasks/feedback/generate_improved_answers.py +3 -3
- cognee/tasks/graph/extract_graph_from_data.py +9 -10
- cognee/tasks/ingestion/ingest_data.py +1 -1
- cognee/tasks/memify/__init__.py +2 -0
- cognee/tasks/memify/cognify_session.py +41 -0
- cognee/tasks/memify/extract_user_sessions.py +73 -0
- cognee/tasks/memify/get_triplet_datapoints.py +289 -0
- cognee/tasks/storage/add_data_points.py +142 -2
- cognee/tasks/storage/index_data_points.py +33 -22
- cognee/tasks/storage/index_graph_edges.py +37 -57
- cognee/tests/integration/documents/CsvDocument_test.py +70 -0
- cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
- cognee/tests/integration/tasks/test_add_data_points.py +139 -0
- cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +1 -1
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +1 -1
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +13 -27
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
- cognee/tests/test_add_docling_document.py +2 -2
- cognee/tests/test_cognee_server_start.py +84 -3
- cognee/tests/test_conversation_history.py +68 -5
- cognee/tests/test_data/example_with_header.csv +3 -0
- cognee/tests/test_dataset_database_handler.py +137 -0
- cognee/tests/test_dataset_delete.py +76 -0
- cognee/tests/test_edge_centered_payload.py +170 -0
- cognee/tests/test_edge_ingestion.py +27 -0
- cognee/tests/test_feedback_enrichment.py +1 -1
- cognee/tests/test_library.py +6 -4
- cognee/tests/test_load.py +62 -0
- cognee/tests/test_multi_tenancy.py +165 -0
- cognee/tests/test_parallel_databases.py +2 -0
- cognee/tests/test_pipeline_cache.py +164 -0
- cognee/tests/test_relational_db_migration.py +54 -2
- cognee/tests/test_search_db.py +44 -2
- cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
- cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
- cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
- cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
- cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
- cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
- cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
- cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
- cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
- cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
- cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
- cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
- cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
- cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
- cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
- cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -6
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/RECORD +215 -163
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/WHEEL +1 -1
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
- cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
- cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
- cognee/modules/retrieval/code_retriever.py +0 -232
- cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
- cognee/tasks/code/get_local_dependencies_checker.py +0 -20
- cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
- cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
- cognee/tasks/repo_processor/__init__.py +0 -2
- cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
- cognee/tasks/repo_processor/get_non_code_files.py +0 -158
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/graph/extract_graph_from_data.py

@@ -2,9 +2,7 @@ import asyncio
 from typing import Type, List, Optional
 from pydantic import BaseModel
 
-from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
-from cognee.tasks.storage import index_graph_edges
 from cognee.tasks.storage.add_data_points import add_data_points
 from cognee.modules.ontology.ontology_config import Config
 from cognee.modules.ontology.get_default_ontology_resolver import (
@@ -25,6 +23,7 @@ from cognee.tasks.graph.exceptions import (
     InvalidChunkGraphInputError,
     InvalidOntologyAdapterError,
 )
+from cognee.modules.cognify.config import get_cognify_config
 
 
 async def integrate_chunk_graphs(
@@ -67,8 +66,6 @@ async def integrate_chunk_graphs(
         type(ontology_resolver).__name__ if ontology_resolver else "None"
     )
 
-    graph_engine = await get_graph_engine()
-
     if graph_model is not KnowledgeGraph:
         for chunk_index, chunk_graph in enumerate(chunk_graphs):
             data_chunks[chunk_index].contains = chunk_graph
@@ -84,12 +81,13 @@
         data_chunks, chunk_graphs, ontology_resolver, existing_edges_map
     )
 
-
-
+    cognify_config = get_cognify_config()
+    embed_triplets = cognify_config.triplet_embedding
 
-    if len(
-        await
-
+    if len(graph_nodes) > 0:
+        await add_data_points(
+            data_points=graph_nodes, custom_edges=graph_edges, embed_triplets=embed_triplets
+        )
 
     return data_chunks
 
@@ -99,6 +97,7 @@ async def extract_graph_from_data(
     graph_model: Type[BaseModel],
     config: Config = None,
     custom_prompt: Optional[str] = None,
+    **kwargs,
 ) -> List[DocumentChunk]:
     """
     Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
@@ -113,7 +112,7 @@
 
     chunk_graphs = await asyncio.gather(
         *[
-            extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt)
+            extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt, **kwargs)
             for chunk in data_chunks
         ]
    )
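Read together, these hunks move triplet embedding behind a cognify-config flag and route it through add_data_points. A minimal sketch of the new call pattern, assuming graph_nodes and graph_edges were produced earlier in the pipeline (the wrapper function name is illustrative):

from cognee.modules.cognify.config import get_cognify_config
from cognee.tasks.storage.add_data_points import add_data_points


async def store_chunk_graph(graph_nodes, graph_edges):
    # Triplet embedding is now driven by cognify config rather than a hard-coded step.
    cognify_config = get_cognify_config()
    embed_triplets = cognify_config.triplet_embedding

    if len(graph_nodes) > 0:
        # Custom edges and the embedding flag are forwarded to the storage task.
        await add_data_points(
            data_points=graph_nodes,
            custom_edges=graph_edges,
            embed_triplets=embed_triplets,
        )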
cognee/tasks/ingestion/ingest_data.py

@@ -99,7 +99,7 @@ async def ingest_data(
 
    # data_id is the hash of original file contents + owner id to avoid duplicate data
 
-    data_id = ingestion.identify(classified_data, user)
+    data_id = await ingestion.identify(classified_data, user)
    original_file_metadata = classified_data.get_metadata()
 
    # Find metadata from Cognee data storage text file
cognee/tasks/memify/cognify_session.py

@@ -0,0 +1,41 @@
+import cognee
+
+from cognee.exceptions import CogneeValidationError, CogneeSystemError
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger("cognify_session")
+
+
+async def cognify_session(data, dataset_id=None):
+    """
+    Process and cognify session data into the knowledge graph.
+
+    Adds session content to cognee with a dedicated "user_sessions" node set,
+    then triggers the cognify pipeline to extract entities and relationships
+    from the session data.
+
+    Args:
+        data: Session string containing Question, Context, and Answer information.
+        dataset_name: Name of dataset.
+
+    Raises:
+        CogneeValidationError: If data is None or empty.
+        CogneeSystemError: If cognee operations fail.
+    """
+    try:
+        if not data or (isinstance(data, str) and not data.strip()):
+            logger.warning("Empty session data provided to cognify_session task, skipping")
+            raise CogneeValidationError(message="Session data cannot be empty", log=False)
+
+        logger.info("Processing session data for cognification")
+
+        await cognee.add(data, dataset_id=dataset_id, node_set=["user_sessions_from_cache"])
+        logger.debug("Session data added to cognee with node_set: user_sessions")
+        await cognee.cognify(datasets=[dataset_id])
+        logger.info("Session data successfully cognified")
+
+    except CogneeValidationError:
+        raise
+    except Exception as e:
+        logger.error(f"Error cognifying session data: {str(e)}")
+        raise CogneeSystemError(message=f"Failed to cognify session data: {str(e)}", log=False)
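A minimal usage sketch for the new task; the dataset UUID and session string are illustrative, and passing dataset_id as a UUID is an assumption:

import asyncio
from uuid import UUID

from cognee.tasks.memify.cognify_session import cognify_session


async def main():
    # Illustrative transcript; in the pipeline this comes from cached Q&A sessions.
    session = "Question: What is cognee?\n\nAnswer: A memory layer for AI agents.\n\n"
    # dataset_id shown as a UUID here (assumption); it is forwarded to cognee.add / cognee.cognify.
    await cognify_session(session, dataset_id=UUID("00000000-0000-0000-0000-000000000000"))


asyncio.run(main())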
cognee/tasks/memify/extract_user_sessions.py

@@ -0,0 +1,73 @@
+from typing import Optional, List
+
+from cognee.context_global_variables import session_user
+from cognee.exceptions import CogneeSystemError
+from cognee.infrastructure.databases.cache.get_cache_engine import get_cache_engine
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.users.models import User
+
+logger = get_logger("extract_user_sessions")
+
+
+async def extract_user_sessions(
+    data,
+    session_ids: Optional[List[str]] = None,
+):
+    """
+    Extract Q&A sessions for the current user from cache.
+
+    Retrieves all Q&A triplets from specified session IDs and yields them
+    as formatted strings combining question, context, and answer.
+
+    Args:
+        data: Data passed from memify. If empty dict ({}), no external data is provided.
+        session_ids: Optional list of specific session IDs to extract.
+
+    Yields:
+        String containing session ID and all Q&A pairs formatted.
+
+    Raises:
+        CogneeSystemError: If cache engine is unavailable or extraction fails.
+    """
+    try:
+        if not data or data == [{}]:
+            logger.info("Fetching session metadata for current user")
+
+        user: User = session_user.get()
+        if not user:
+            raise CogneeSystemError(message="No authenticated user found in context", log=False)
+
+        user_id = str(user.id)
+
+        cache_engine = get_cache_engine()
+        if cache_engine is None:
+            raise CogneeSystemError(
+                message="Cache engine not available for session extraction, please enable caching in order to have sessions to save",
+                log=False,
+            )
+
+        if session_ids:
+            for session_id in session_ids:
+                try:
+                    qa_data = await cache_engine.get_all_qas(user_id, session_id)
+                    if qa_data:
+                        logger.info(f"Extracted session {session_id} with {len(qa_data)} Q&A pairs")
+                        session_string = f"Session ID: {session_id}\n\n"
+                        for qa_pair in qa_data:
+                            question = qa_pair.get("question", "")
+                            answer = qa_pair.get("answer", "")
+                            session_string += f"Question: {question}\n\nAnswer: {answer}\n\n"
+                        yield session_string
+                except Exception as e:
+                    logger.warning(f"Failed to extract session {session_id}: {str(e)}")
+                    continue
+        else:
+            logger.info(
+                "No specific session_ids provided. Please specify which sessions to extract."
+            )
+
+    except CogneeSystemError:
+        raise
+    except Exception as e:
+        logger.error(f"Error extracting user sessions: {str(e)}")
+        raise CogneeSystemError(message=f"Failed to extract user sessions: {str(e)}", log=False)
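A hedged sketch of consuming the generator directly; in practice it runs inside a memify pipeline, and an authenticated user in session_user plus an enabled cache backend are assumed:

from cognee.tasks.memify.extract_user_sessions import extract_user_sessions


async def dump_sessions():
    # data=[{}] means "no external data provided", matching the task's docstring.
    async for session_string in extract_user_sessions([{}], session_ids=["session-123"]):
        # Each yielded string starts with "Session ID: ..." followed by Question/Answer pairs.
        print(session_string)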
cognee/tasks/memify/get_triplet_datapoints.py

@@ -0,0 +1,289 @@
+from typing import AsyncGenerator, Dict, Any, List, Optional
+from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
+from cognee.modules.engine.utils import generate_node_id
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses
+from cognee.infrastructure.engine import DataPoint
+from cognee.modules.engine.models import Triplet
+from cognee.tasks.storage import index_data_points
+
+logger = get_logger("get_triplet_datapoints")
+
+
+def _build_datapoint_type_index_mapping() -> Dict[str, List[str]]:
+    """
+    Build a mapping of DataPoint type names to their index_fields.
+
+    Returns:
+    --------
+    - Dict[str, List[str]]: Mapping of type name to list of index field names
+    """
+    logger.debug("Building DataPoint type to index_fields mapping")
+    subclasses = get_all_subclasses(DataPoint)
+    datapoint_type_index_property = {}
+
+    for subclass in subclasses:
+        if "metadata" in subclass.model_fields:
+            metadata_field = subclass.model_fields["metadata"]
+            default = getattr(metadata_field, "default", None)
+            if isinstance(default, dict):
+                index_fields = default.get("index_fields", [])
+                if index_fields:
+                    datapoint_type_index_property[subclass.__name__] = index_fields
+                    logger.debug(
+                        f"Registered {subclass.__name__} with index_fields: {index_fields}"
+                    )
+
+    logger.info(
+        f"Found {len(datapoint_type_index_property)} DataPoint types with index_fields: "
+        f"{list(datapoint_type_index_property.keys())}"
+    )
+    return datapoint_type_index_property
+
+
+def _extract_embeddable_text(node_or_edge: Dict[str, Any], index_fields: List[str]) -> str:
+    """
+    Extract and concatenate embeddable properties from a node or edge dictionary.
+
+    Parameters:
+    -----------
+    - node_or_edge (Dict[str, Any]): Dictionary containing node or edge properties.
+    - index_fields (List[str]): List of field names to extract and concatenate.
+
+    Returns:
+    --------
+    - str: Concatenated string of all embeddable property values, or empty string if none found.
+    """
+    if not node_or_edge or not index_fields:
+        return ""
+
+    embeddable_values = []
+    for field_name in index_fields:
+        field_value = node_or_edge.get(field_name)
+        if field_value is not None:
+            field_value = str(field_value).strip()
+
+            if field_value:
+                embeddable_values.append(field_value)
+
+    return " ".join(embeddable_values) if embeddable_values else ""
+
+
+def _extract_relationship_text(
+    relationship: Dict[str, Any], datapoint_type_index_property: Dict[str, List[str]]
+) -> str:
+    """
+    Extract relationship text from edge properties.
+
+    Parameters:
+    -----------
+    - relationship (Dict[str, Any]): Dictionary containing relationship properties
+    - datapoint_type_index_property (Dict[str, List[str]]): Mapping of type to index fields
+
+    Returns:
+    --------
+    - str: Extracted relationship text or empty string
+    """
+    if not relationship:
+        return ""
+
+    edge_text = relationship.get("edge_text")
+    if edge_text and isinstance(edge_text, str) and edge_text.strip():
+        return edge_text.strip()
+
+    # Fallback to extracting from EdgeType index_fields
+    edge_type_index_fields = datapoint_type_index_property.get("EdgeType", [])
+    return _extract_embeddable_text(relationship, edge_type_index_fields)
+
+
+def _process_single_triplet(
+    triplet_datapoint: Dict[str, Any],
+    datapoint_type_index_property: Dict[str, List[str]],
+    offset: int,
+    idx: int,
+) -> tuple[Optional[Triplet], Optional[str]]:
+    """
+    Process a single triplet and create a Triplet object.
+
+    Parameters:
+    -----------
+    - triplet_datapoint (Dict[str, Any]): Raw triplet data from graph engine
+    - datapoint_type_index_property (Dict[str, List[str]]): Type to index fields mapping
+    - offset (int): Current batch offset
+    - idx (int): Index within current batch
+
+    Returns:
+    --------
+    - tuple[Optional[Triplet], Optional[str]]: (Triplet object, error message if skipped)
+    """
+    start_node = triplet_datapoint.get("start_node", {})
+    end_node = triplet_datapoint.get("end_node", {})
+    relationship = triplet_datapoint.get("relationship_properties", {})
+
+    start_node_type = start_node.get("type")
+    end_node_type = end_node.get("type")
+
+    start_index_fields = datapoint_type_index_property.get(start_node_type, [])
+    end_index_fields = datapoint_type_index_property.get(end_node_type, [])
+
+    if not start_index_fields:
+        logger.debug(
+            f"No index_fields found for start_node type '{start_node_type}' in triplet {offset + idx}"
+        )
+    if not end_index_fields:
+        logger.debug(
+            f"No index_fields found for end_node type '{end_node_type}' in triplet {offset + idx}"
+        )
+
+    start_node_id = start_node.get("id", "")
+    end_node_id = end_node.get("id", "")
+
+    if not start_node_id or not end_node_id:
+        return None, (
+            f"Skipping triplet at offset {offset + idx}: missing node IDs "
+            f"(start: {start_node_id}, end: {end_node_id})"
+        )
+
+    relationship_text = _extract_relationship_text(relationship, datapoint_type_index_property)
+    start_node_text = _extract_embeddable_text(start_node, start_index_fields)
+    end_node_text = _extract_embeddable_text(end_node, end_index_fields)
+
+    if not start_node_text and not end_node_text and not relationship_text:
+        return None, (
+            f"Skipping triplet at offset {offset + idx}: empty embeddable text "
+            f"(start_node_id: {start_node_id}, end_node_id: {end_node_id})"
+        )
+
+    embeddable_text = f"{start_node_text}-›{relationship_text}-›{end_node_text}".strip()
+
+    relationship_name = relationship.get("relationship_name", "")
+    triplet_id = generate_node_id(str(start_node_id) + str(relationship_name) + str(end_node_id))
+
+    triplet_obj = Triplet(
+        id=triplet_id, from_node_id=start_node_id, to_node_id=end_node_id, text=embeddable_text
+    )
+
+    return triplet_obj, None
+
+
+async def get_triplet_datapoints(
+    data,
+    triplets_batch_size: int = 100,
+) -> AsyncGenerator[Triplet, None]:
+    """
+    Async generator that yields batches of triplet datapoints with embeddable text extracted.
+
+    Each triplet in the batch includes:
+    - Original triplet structure (start_node, relationship_properties, end_node)
+    - Extracted embeddable text for each element based on index_fields
+
+    Parameters:
+    -----------
+    - triplets_batch_size (int): Number of triplets to retrieve per batch. Default is 100.
+
+    Yields:
+    -------
+    - List[Dict[str, Any]]: A batch of triplets, each enriched with embeddable text.
+    """
+    if not data or data == [{}]:
+        logger.info("Fetching graph data for current user")
+
+    logger.info(f"Starting triplet datapoints extraction with batch size: {triplets_batch_size}")
+
+    graph_engine = await get_graph_engine()
+    graph_engine_type = type(graph_engine).__name__
+    logger.debug(f"Using graph engine: {graph_engine_type}")
+
+    if not hasattr(graph_engine, "get_triplets_batch"):
+        error_msg = f"Graph adapter {graph_engine_type} does not support get_triplets_batch method"
+        logger.error(error_msg)
+        raise NotImplementedError(error_msg)
+
+    datapoint_type_index_property = _build_datapoint_type_index_mapping()
+
+    offset = 0
+    total_triplets_processed = 0
+    batch_number = 0
+
+    while True:
+        try:
+            batch_number += 1
+            logger.debug(
+                f"Fetching triplet batch {batch_number} (offset: {offset}, limit: {triplets_batch_size})"
+            )
+
+            triplets_batch = await graph_engine.get_triplets_batch(
+                offset=offset, limit=triplets_batch_size
+            )
+
+            if not triplets_batch:
+                logger.info(f"No more triplets found at offset {offset}. Processing complete.")
+                break
+
+            logger.debug(f"Retrieved {len(triplets_batch)} triplets in batch {batch_number}")
+
+            triplet_datapoints = []
+            skipped_count = 0
+
+            for idx, triplet_datapoint in enumerate(triplets_batch):
+                try:
+                    triplet_obj, error_msg = _process_single_triplet(
+                        triplet_datapoint, datapoint_type_index_property, offset, idx
+                    )
+
+                    if error_msg:
+                        logger.warning(error_msg)
+                        skipped_count += 1
+                        continue
+
+                    if triplet_obj:
+                        triplet_datapoints.append(triplet_obj)
+                        yield triplet_obj
+
+                except Exception as e:
+                    logger.warning(
+                        f"Error processing triplet at offset {offset + idx}: {e}. "
+                        f"Skipping this triplet and continuing."
+                    )
+                    skipped_count += 1
+                    continue
+
+            if skipped_count > 0:
+                logger.warning(
+                    f"Skipped {skipped_count} out of {len(triplets_batch)} triplets in batch {batch_number}"
+                )
+
+            if not triplet_datapoints:
+                logger.warning(
+                    f"No valid triplet datapoints in batch {batch_number} after processing"
+                )
+                offset += len(triplets_batch)
+                if len(triplets_batch) < triplets_batch_size:
+                    break
+                continue
+
+            total_triplets_processed += len(triplet_datapoints)
+            logger.info(
+                f"Batch {batch_number} complete: processed {len(triplet_datapoints)} triplets "
+                f"(total processed: {total_triplets_processed})"
+            )
+
+            offset += len(triplets_batch)
+            if len(triplets_batch) < triplets_batch_size:
+                logger.info(
+                    f"Last batch retrieved (got {len(triplets_batch)} < {triplets_batch_size} triplets). "
+                    f"Processing complete."
+                )
+                break
+
+        except Exception as e:
+            logger.error(
+                f"Error retrieving triplet batch {batch_number} at offset {offset}: {e}",
+                exc_info=True,
+            )
+            raise
+
+    logger.info(
+        f"Triplet datapoints extraction complete. "
+        f"Processed {total_triplets_processed} triplets across {batch_number} batch(es)."
+    )
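A hedged sketch of wiring this generator into triplet indexing, roughly what the new create_triplet_embeddings memify pipeline appears to do; get_triplet_datapoints and index_data_points are the modules shown in this diff, while the batching wrapper itself is an assumption:

from cognee.tasks.memify.get_triplet_datapoints import get_triplet_datapoints
from cognee.tasks.storage import index_data_points


async def embed_existing_graph_triplets():
    batch = []
    # The generator walks the graph in batches and yields Triplet datapoints
    # whose `text` field is ready for embedding.
    async for triplet in get_triplet_datapoints([{}], triplets_batch_size=100):
        batch.append(triplet)
        if len(batch) >= 100:
            await index_data_points(batch)  # embeds and stores the triplet texts
            batch = []
    if batch:
        await index_data_points(batch)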
cognee/tasks/storage/add_data_points.py

@@ -1,16 +1,23 @@
 import asyncio
-from typing import List
+from typing import List, Dict, Optional
 from cognee.infrastructure.engine import DataPoint
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.utils import deduplicate_nodes_and_edges, get_graph_from_model
 from .index_data_points import index_data_points
 from .index_graph_edges import index_graph_edges
+from cognee.modules.engine.models import Triplet
+from cognee.shared.logging_utils import get_logger
 from cognee.tasks.storage.exceptions import (
     InvalidDataPointsInAddDataPointsError,
 )
+from ...modules.engine.utils import generate_node_id
 
+logger = get_logger("add_data_points")
 
-async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
+
+async def add_data_points(
+    data_points: List[DataPoint], custom_edges: Optional[List] = None, embed_triplets: bool = False
+) -> List[DataPoint]:
     """
     Add a batch of data points to the graph database by extracting nodes and edges,
     deduplicating them, and indexing them for retrieval.
@@ -23,6 +30,10 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
     Args:
         data_points (List[DataPoint]):
             A list of data points to process and insert into the graph.
+        custom_edges (List[tuple]): Custom edges between datapoints.
+        embed_triplets (bool):
+            If True, creates and indexes triplet embeddings from the graph structure.
+            Defaults to False.
 
     Returns:
         List[DataPoint]:
@@ -34,6 +45,7 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
         - Updates the node index via `index_data_points`.
         - Inserts nodes and edges into the graph engine.
         - Optionally updates the edge index via `index_graph_edges`.
+        - Optionally creates and indexes triplet embeddings if embed_triplets is True.
     """
 
     if not isinstance(data_points, list):
@@ -74,4 +86,132 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
     await graph_engine.add_edges(edges)
     await index_graph_edges(edges)
 
+    if isinstance(custom_edges, list) and custom_edges:
+        # This must be handled separately from datapoint edges, created a task in linear to dig deeper but (COG-3488)
+        await graph_engine.add_edges(custom_edges)
+        await index_graph_edges(custom_edges)
+        edges.extend(custom_edges)
+
+    if embed_triplets:
+        triplets = _create_triplets_from_graph(nodes, edges)
+        if triplets:
+            await index_data_points(triplets)
+            logger.info(f"Created and indexed {len(triplets)} triplets from graph structure")
+
     return data_points
+
+
+def _extract_embeddable_text_from_datapoint(data_point: DataPoint) -> str:
+    """
+    Extract embeddable text from a DataPoint using its index_fields metadata.
+    Uses the same approach as index_data_points.
+
+    Parameters:
+    -----------
+    - data_point (DataPoint): The data point to extract text from.
+
+    Returns:
+    --------
+    - str: Concatenated string of all embeddable property values, or empty string if none found.
+    """
+    if not data_point or not hasattr(data_point, "metadata"):
+        return ""
+
+    index_fields = data_point.metadata.get("index_fields", [])
+    if not index_fields:
+        return ""
+
+    embeddable_values = []
+    for field_name in index_fields:
+        field_value = getattr(data_point, field_name, None)
+        if field_value is not None:
+            field_value = str(field_value).strip()
+
+            if field_value:
+                embeddable_values.append(field_value)
+
+    return " ".join(embeddable_values) if embeddable_values else ""
+
+
+def _create_triplets_from_graph(nodes: List[DataPoint], edges: List[tuple]) -> List[Triplet]:
+    """
+    Create Triplet objects from graph nodes and edges.
+
+    This function processes graph edges and their corresponding nodes to create
+    triplet datapoints with embeddable text, similar to the triplet embeddings pipeline.
+
+    Parameters:
+    -----------
+    - nodes (List[DataPoint]): List of graph nodes extracted from data points
+    - edges (List[tuple]): List of edge tuples in format
+      (source_node_id, target_node_id, relationship_name, properties_dict)
+      Note: All edges including those from DocumentChunk.contains are already extracted
+      by get_graph_from_model and included in this list.
+
+    Returns:
+    --------
+    - List[Triplet]: List of Triplet objects ready for indexing
+    """
+    node_map: Dict[str, DataPoint] = {}
+    for node in nodes:
+        if hasattr(node, "id"):
+            node_id = str(node.id)
+            if node_id not in node_map:
+                node_map[node_id] = node
+
+    triplets = []
+    skipped_count = 0
+    seen_ids = set()
+
+    for edge_tuple in edges:
+        if len(edge_tuple) < 4:
+            continue
+
+        source_node_id, target_node_id, relationship_name, edge_properties = (
+            edge_tuple[0],
+            edge_tuple[1],
+            edge_tuple[2],
+            edge_tuple[3],
+        )
+
+        source_node = node_map.get(str(source_node_id))
+        target_node = node_map.get(str(target_node_id))
+
+        if not source_node or not target_node or relationship_name is None:
+            skipped_count += 1
+            continue
+
+        source_node_text = _extract_embeddable_text_from_datapoint(source_node)
+        target_node_text = _extract_embeddable_text_from_datapoint(target_node)
+
+        relationship_text = ""
+        if isinstance(edge_properties, dict):
+            edge_text = edge_properties.get("edge_text")
+            if edge_text and isinstance(edge_text, str) and edge_text.strip():
+                relationship_text = edge_text.strip()
+
+        if not relationship_text and relationship_name:
+            relationship_text = relationship_name
+
+        if not source_node_text and not relationship_text and not relationship_name:
+            skipped_count += 1
+            continue
+
+        embeddable_text = f"{source_node_text} -› {relationship_text}-›{target_node_text}".strip()
+
+        triplet_id = generate_node_id(str(source_node_id) + relationship_name + str(target_node_id))
+
+        if triplet_id in seen_ids:
+            continue
+        seen_ids.add(triplet_id)
+
+        triplets.append(
+            Triplet(
+                id=triplet_id,
+                from_node_id=str(source_node_id),
+                to_node_id=str(target_node_id),
+                text=embeddable_text,
+            )
+        )
+
+    return triplets
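A hedged sketch of the widened task signature; the node list and edge tuple are placeholders built elsewhere in a pipeline, and only the keyword arguments are taken from this release:

from cognee.tasks.storage.add_data_points import add_data_points


async def store_graph(nodes, custom_edges):
    # nodes: List[DataPoint] built earlier in the pipeline.
    # custom_edges: tuples shaped (source_id, target_id, relationship_name, properties),
    # matching the format documented in _create_triplets_from_graph.
    await add_data_points(
        data_points=nodes,
        custom_edges=custom_edges,  # indexed separately from datapoint-derived edges
        embed_triplets=True,        # also create and index Triplet embeddings
    )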