cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. cognee/api/client.py +44 -4
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +13 -3
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
  116. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  117. cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
  118. cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
  119. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  120. cognee/modules/ingestion/data_types/TextData.py +8 -2
  121. cognee/modules/ingestion/save_data_to_file.py +1 -1
  122. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  123. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  124. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  125. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  126. cognee/modules/pipelines/models/__init__.py +1 -0
  127. cognee/modules/pipelines/operations/pipeline.py +10 -2
  128. cognee/modules/pipelines/operations/run_tasks.py +252 -20
  129. cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
  130. cognee/modules/retrieval/chunks_retriever.py +23 -1
  131. cognee/modules/retrieval/code_retriever.py +66 -9
  132. cognee/modules/retrieval/completion_retriever.py +11 -9
  133. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  134. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  135. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  136. cognee/modules/retrieval/graph_completion_retriever.py +1 -1
  137. cognee/modules/retrieval/insights_retriever.py +4 -0
  138. cognee/modules/retrieval/natural_language_retriever.py +9 -15
  139. cognee/modules/retrieval/summaries_retriever.py +23 -1
  140. cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
  141. cognee/modules/retrieval/utils/completion.py +6 -9
  142. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  143. cognee/modules/search/methods/search.py +5 -1
  144. cognee/modules/search/operations/__init__.py +1 -0
  145. cognee/modules/search/operations/select_search_type.py +42 -0
  146. cognee/modules/search/types/SearchType.py +1 -0
  147. cognee/modules/settings/get_settings.py +0 -8
  148. cognee/modules/settings/save_vector_db_config.py +1 -1
  149. cognee/shared/data_models.py +3 -1
  150. cognee/shared/logging_utils.py +0 -5
  151. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  152. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  153. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  154. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  155. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  156. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  157. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  158. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  159. cognee/tasks/graph/infer_data_ontology.py +5 -6
  160. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  161. cognee/tasks/ingestion/ingest_data.py +91 -61
  162. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  163. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  164. cognee/tasks/storage/index_data_points.py +1 -1
  165. cognee/tasks/storage/index_graph_edges.py +4 -1
  166. cognee/tasks/summarization/summarize_code.py +2 -3
  167. cognee/tasks/summarization/summarize_text.py +3 -2
  168. cognee/tests/test_cognee_server_start.py +12 -7
  169. cognee/tests/test_deduplication.py +2 -2
  170. cognee/tests/test_deletion.py +58 -17
  171. cognee/tests/test_graph_visualization_permissions.py +161 -0
  172. cognee/tests/test_neptune_analytics_graph.py +309 -0
  173. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  174. cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
  175. cognee/tests/test_pgvector.py +5 -5
  176. cognee/tests/test_s3.py +1 -6
  177. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  178. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  179. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  180. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  181. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  182. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  183. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  184. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
  185. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  186. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
  187. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
  188. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  189. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  190. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  191. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  192. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  193. cognee/modules/data/extraction/extract_categories.py +0 -14
  194. cognee/tests/test_qdrant.py +0 -99
  195. distributed/Dockerfile +0 -34
  196. distributed/app.py +0 -4
  197. distributed/entrypoint.py +0 -71
  198. distributed/entrypoint.sh +0 -5
  199. distributed/modal_image.py +0 -11
  200. distributed/queues.py +0 -5
  201. distributed/tasks/queued_add_data_points.py +0 -13
  202. distributed/tasks/queued_add_edges.py +0 -13
  203. distributed/tasks/queued_add_nodes.py +0 -13
  204. distributed/test.py +0 -28
  205. distributed/utils.py +0 -19
  206. distributed/workers/data_point_saving_worker.py +0 -93
  207. distributed/workers/graph_saving_worker.py +0 -104
  208. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  209. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  210. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  211. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  212. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  213. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  214. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  215. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  216. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  217. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  218. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  219. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  220. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  221. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
  222. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
  223. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
--- a/cognee/modules/pipelines/operations/run_tasks.py
+++ b/cognee/modules/pipelines/operations/run_tasks.py
@@ -1,21 +1,31 @@
 import os
+
+import asyncio
 from uuid import UUID
-from typing import Any
+from typing import Any, List
 from functools import wraps
+from sqlalchemy import select
 
+import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.pipelines.operations.run_tasks_distributed import run_tasks_distributed
 from cognee.modules.users.models import User
+from cognee.modules.data.models import Data
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines.utils import generate_pipeline_id
+from cognee.modules.pipelines.exceptions import PipelineRunFailedError
+from cognee.tasks.ingestion import save_data_item_to_storage, resolve_data_directories
 from cognee.modules.pipelines.models.PipelineRunInfo import (
     PipelineRunCompleted,
     PipelineRunErrored,
     PipelineRunStarted,
     PipelineRunYield,
+    PipelineRunAlreadyCompleted,
 )
+from cognee.modules.pipelines.models.DataItemStatus import DataItemStatus
 
 from cognee.modules.pipelines.operations import (
     log_pipeline_run_start,
@@ -50,15 +60,186 @@ def override_run_tasks(new_gen):
 
 @override_run_tasks(run_tasks_distributed)
 async def run_tasks(
-    tasks: list[Task],
+    tasks: List[Task],
     dataset_id: UUID,
-    data: Any = None,
+    data: List[Any] = None,
     user: User = None,
     pipeline_name: str = "unknown_pipeline",
     context: dict = None,
+    incremental_loading: bool = False,
 ):
+    async def _run_tasks_data_item_incremental(
+        data_item,
+        dataset,
+        tasks,
+        pipeline_name,
+        pipeline_id,
+        pipeline_run_id,
+        context,
+        user,
+    ):
+        db_engine = get_relational_engine()
+        # If incremental_loading of data is set to True don't process documents already processed by pipeline
+        # If data is being added to Cognee for the first time calculate the id of the data
+        if not isinstance(data_item, Data):
+            file_path = await save_data_item_to_storage(data_item)
+            # Ingest data and add metadata
+            async with open_data_file(file_path) as file:
+                classified_data = ingestion.classify(file)
+                # data_id is the hash of file contents + owner id to avoid duplicate data
+                data_id = ingestion.identify(classified_data, user)
+        else:
+            # If data was already processed by Cognee get data id
+            data_id = data_item.id
+
+        # Check pipeline status, if Data already processed for pipeline before skip current processing
+        async with db_engine.get_async_session() as session:
+            data_point = (
+                await session.execute(select(Data).filter(Data.id == data_id))
+            ).scalar_one_or_none()
+            if data_point:
+                if (
+                    data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id))
+                    == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+                ):
+                    yield {
+                        "run_info": PipelineRunAlreadyCompleted(
+                            pipeline_run_id=pipeline_run_id,
+                            dataset_id=dataset.id,
+                            dataset_name=dataset.name,
+                        ),
+                        "data_id": data_id,
+                    }
+                    return
+
+        try:
+            # Process data based on data_item and list of tasks
+            async for result in run_tasks_with_telemetry(
+                tasks=tasks,
+                data=[data_item],
+                user=user,
+                pipeline_name=pipeline_id,
+                context=context,
+            ):
+                yield PipelineRunYield(
+                    pipeline_run_id=pipeline_run_id,
+                    dataset_id=dataset.id,
+                    dataset_name=dataset.name,
+                    payload=result,
+                )
+
+            # Update pipeline status for Data element
+            async with db_engine.get_async_session() as session:
+                data_point = (
+                    await session.execute(select(Data).filter(Data.id == data_id))
+                ).scalar_one_or_none()
+                data_point.pipeline_status[pipeline_name] = {
+                    str(dataset.id): DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+                }
+                await session.merge(data_point)
+                await session.commit()
+
+            yield {
+                "run_info": PipelineRunCompleted(
+                    pipeline_run_id=pipeline_run_id,
+                    dataset_id=dataset.id,
+                    dataset_name=dataset.name,
+                ),
+                "data_id": data_id,
+            }
+
+        except Exception as error:
+            # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
+            logger.error(
+                f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
+            )
+            yield {
+                "run_info": PipelineRunErrored(
+                    pipeline_run_id=pipeline_run_id,
+                    payload=repr(error),
+                    dataset_id=dataset.id,
+                    dataset_name=dataset.name,
+                ),
+                "data_id": data_id,
+            }
+
+            if os.getenv("RAISE_INCREMENTAL_LOADING_ERRORS", "true").lower() == "true":
+                raise error
+
+    async def _run_tasks_data_item_regular(
+        data_item,
+        dataset,
+        tasks,
+        pipeline_id,
+        pipeline_run_id,
+        context,
+        user,
+    ):
+        # Process data based on data_item and list of tasks
+        async for result in run_tasks_with_telemetry(
+            tasks=tasks,
+            data=[data_item],
+            user=user,
+            pipeline_name=pipeline_id,
+            context=context,
+        ):
+            yield PipelineRunYield(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+                payload=result,
+            )
+
+        yield {
+            "run_info": PipelineRunCompleted(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+            )
+        }
+
+    async def _run_tasks_data_item(
+        data_item,
+        dataset,
+        tasks,
+        pipeline_name,
+        pipeline_id,
+        pipeline_run_id,
+        context,
+        user,
+        incremental_loading,
+    ):
+        # Go through async generator and return data item processing result. Result can be PipelineRunAlreadyCompleted when data item is skipped,
+        # PipelineRunCompleted when processing was successful and PipelineRunErrored if there were issues
+        result = None
+        if incremental_loading:
+            async for result in _run_tasks_data_item_incremental(
+                data_item=data_item,
+                dataset=dataset,
+                tasks=tasks,
+                pipeline_name=pipeline_name,
+                pipeline_id=pipeline_id,
+                pipeline_run_id=pipeline_run_id,
+                context=context,
+                user=user,
+            ):
+                pass
+        else:
+            async for result in _run_tasks_data_item_regular(
+                data_item=data_item,
+                dataset=dataset,
+                tasks=tasks,
+                pipeline_id=pipeline_id,
+                pipeline_run_id=pipeline_run_id,
+                context=context,
+                user=user,
+            ):
+                pass
+
+        return result
+
     if not user:
-        user = get_default_user()
+        user = await get_default_user()
 
     # Get Dataset object
     db_engine = get_relational_engine()
@@ -68,9 +249,7 @@ async def run_tasks(
         dataset = await session.get(Dataset, dataset_id)
 
     pipeline_id = generate_pipeline_id(user.id, dataset.id, pipeline_name)
-
     pipeline_run = await log_pipeline_run_start(pipeline_id, pipeline_name, dataset_id, data)
-
     pipeline_run_id = pipeline_run.pipeline_run_id
 
     yield PipelineRunStarted(
@@ -81,18 +260,65 @@ async def run_tasks(
     )
 
     try:
-        async for result in run_tasks_with_telemetry(
-            tasks=tasks,
-            data=data,
-            user=user,
-            pipeline_name=pipeline_id,
-            context=context,
-        ):
-            yield PipelineRunYield(
-                pipeline_run_id=pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-                payload=result,
+        if not isinstance(data, list):
+            data = [data]
+
+        if incremental_loading:
+            data = await resolve_data_directories(data)
+
+        # TODO: Return to using async.gather for data items after Cognee release
+        # # Create async tasks per data item that will run the pipeline for the data item
+        # data_item_tasks = [
+        #     asyncio.create_task(
+        #         _run_tasks_data_item(
+        #             data_item,
+        #             dataset,
+        #             tasks,
+        #             pipeline_name,
+        #             pipeline_id,
+        #             pipeline_run_id,
+        #             context,
+        #             user,
+        #             incremental_loading,
+        #         )
+        #     )
+        #     for data_item in data
+        # ]
+        # results = await asyncio.gather(*data_item_tasks)
+        # # Remove skipped data items from results
+        # results = [result for result in results if result]
+
+        ### TEMP sync data item handling
+        results = []
+        # Run the pipeline for each data_item sequentially, one after the other
+        for data_item in data:
+            result = await _run_tasks_data_item(
+                data_item,
+                dataset,
+                tasks,
+                pipeline_name,
+                pipeline_id,
+                pipeline_run_id,
+                context,
+                user,
+                incremental_loading,
+            )
+
+            # Skip items that returned a false-y value
+            if result:
+                results.append(result)
+        ### END
+
+        # Remove skipped data items from results
+        results = [result for result in results if result]
+
+        # If any data item could not be processed propagate error
+        errored_results = [
+            result for result in results if isinstance(result["run_info"], PipelineRunErrored)
+        ]
+        if errored_results:
+            raise PipelineRunFailedError(
+                message="Pipeline run failed. Data item could not be processed."
             )
 
         await log_pipeline_run_complete(
@@ -103,6 +329,7 @@ async def run_tasks(
             pipeline_run_id=pipeline_run_id,
             dataset_id=dataset.id,
             dataset_name=dataset.name,
+            data_ingestion_info=results,
         )
 
         graph_engine = await get_graph_engine()
@@ -120,9 +347,14 @@ async def run_tasks(
 
         yield PipelineRunErrored(
             pipeline_run_id=pipeline_run_id,
-            payload=error,
+            payload=repr(error),
            dataset_id=dataset.id,
             dataset_name=dataset.name,
+            data_ingestion_info=locals().get(
+                "results"
+            ),  # Returns results if they exist or returns None
         )
 
-        raise error
+        # In case of error during incremental loading of data just let the user know the pipeline Errored, don't raise error
+        if not isinstance(error, PipelineRunFailedError):
+            raise error
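
Taken together, the run_tasks.py changes add per-item incremental loading: a data item's id is the hash of its contents plus the owner id, items whose pipeline_status already records DATA_ITEM_PROCESSING_COMPLETED for the dataset are skipped with PipelineRunAlreadyCompleted, and per-item failures are collected and surfaced as PipelineRunErrored instead of aborting the whole run. A minimal sketch of driving the new generator from the caller side (the pipeline name and the way tasks, dataset_id, and user are prepared are illustrative assumptions, not part of this diff):

    import asyncio

    from cognee.modules.pipelines.operations.run_tasks import run_tasks

    async def ingest(tasks, dataset_id, data_items, user):
        # Consume the async generator; run_info values are the PipelineRunStarted /
        # PipelineRunYield / PipelineRunCompleted objects yielded above.
        async for run_info in run_tasks(
            tasks=tasks,
            dataset_id=dataset_id,
            data=data_items,
            user=user,
            pipeline_name="my_pipeline",  # illustrative name
            incremental_loading=True,  # new flag: skip already-processed items
        ):
            print(run_info)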
--- a/cognee/modules/pipelines/operations/run_tasks_distributed.py
+++ b/cognee/modules/pipelines/operations/run_tasks_distributed.py
@@ -44,7 +44,7 @@ if modal:
 
     async def run_tasks_distributed(tasks, dataset_id, data, user, pipeline_name, context):
         if not user:
-            user = get_default_user()
+            user = await get_default_user()
 
         db_engine = get_relational_engine()
         async with db_engine.get_async_session() as session:
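
This one-line fix (mirrored in run_tasks.py above) matters because get_default_user is a coroutine function: called without await, it binds user to a coroutine object rather than a User, and the session code downstream receives the wrong type. A self-contained illustration of the failure mode, with a stand-in coroutine:

    import asyncio

    async def get_default_user():
        return "default-user"  # stand-in for the real User lookup

    async def main():
        user = get_default_user()  # bug: a coroutine object that never runs
        print(type(user))  # <class 'coroutine'> (Python also warns it was never awaited)
        user = await get_default_user()  # the fix applied in this diff
        print(user)  # default-user

    asyncio.run(main())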
--- a/cognee/modules/retrieval/chunks_retriever.py
+++ b/cognee/modules/retrieval/chunks_retriever.py
@@ -1,10 +1,13 @@
 from typing import Any, Optional
 
+from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.modules.retrieval.base_retriever import BaseRetriever
 from cognee.modules.retrieval.exceptions.exceptions import NoDataError
 from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError
 
+logger = get_logger("ChunksRetriever")
+
 
 class ChunksRetriever(BaseRetriever):
     """
@@ -41,14 +44,22 @@ class ChunksRetriever(BaseRetriever):
 
         - Any: A list of document chunk payloads retrieved from the search.
         """
+        logger.info(
+            f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'"
+        )
+
         vector_engine = get_vector_engine()
 
         try:
             found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k)
+            logger.info(f"Found {len(found_chunks)} chunks from vector search")
         except CollectionNotFoundError as error:
+            logger.error("DocumentChunk_text collection not found in vector database")
             raise NoDataError("No data found in the system, please add data first.") from error
 
-        return [result.payload for result in found_chunks]
+        chunk_payloads = [result.payload for result in found_chunks]
+        logger.info(f"Returning {len(chunk_payloads)} chunk payloads")
+        return chunk_payloads
 
     async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
         """
@@ -70,6 +81,17 @@ class ChunksRetriever(BaseRetriever):
         - Any: The context used for the completion or the retrieved context if none was
           provided.
         """
+        logger.info(
+            f"Starting completion generation for query: '{query[:100]}{'...' if len(query) > 100 else ''}'"
+        )
+
         if context is None:
+            logger.debug("No context provided, retrieving context from vector database")
             context = await self.get_context(query)
+        else:
+            logger.debug("Using provided context")
+
+        logger.info(
+            f"Returning context with {len(context) if isinstance(context, list) else 1} item(s)"
+        )
         return context
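
chunks_retriever.py now logs every stage of retrieval instead of failing silently. A short usage sketch, assuming data has already been added and cognified; the no-argument constructor shown here is an assumption, since only self.top_k appears in the diff:

    import asyncio

    from cognee.modules.retrieval.chunks_retriever import ChunksRetriever

    async def main():
        retriever = ChunksRetriever()  # assumed default constructor
        # get_completion returns the retrieved chunk payloads (the context itself)
        chunks = await retriever.get_completion("What does cognee do?")
        for payload in chunks:
            print(payload)

    asyncio.run(main())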
--- a/cognee/modules/retrieval/code_retriever.py
+++ b/cognee/modules/retrieval/code_retriever.py
@@ -3,11 +3,13 @@ import asyncio
 import aiofiles
 from pydantic import BaseModel
 
+from cognee.shared.logging_utils import get_logger
 from cognee.modules.retrieval.base_retriever import BaseRetriever
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.vector import get_vector_engine
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.infrastructure.llm.prompts import read_query_prompt
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
+
+logger = get_logger("CodeRetriever")
 
 
 class CodeRetriever(BaseRetriever):
@@ -35,26 +37,42 @@ class CodeRetriever(BaseRetriever):
 
     async def _process_query(self, query: str) -> "CodeRetriever.CodeQueryInfo":
         """Process the query using LLM to extract file names and source code parts."""
-        system_prompt = read_query_prompt("codegraph_retriever_system.txt")
-        llm_client = get_llm_client()
+        logger.debug(
+            f"Processing query with LLM: '{query[:100]}{'...' if len(query) > 100 else ''}'"
+        )
+
+        system_prompt = LLMGateway.read_query_prompt("codegraph_retriever_system.txt")
+
         try:
-            return await llm_client.acreate_structured_output(
+            result = await LLMGateway.acreate_structured_output(
                 text_input=query,
                 system_prompt=system_prompt,
                 response_model=self.CodeQueryInfo,
             )
+            logger.info(
+                f"LLM extracted {len(result.filenames)} filenames and {len(result.sourcecode)} chars of source code"
+            )
+            return result
         except Exception as e:
+            logger.error(f"Failed to retrieve structured output from LLM: {str(e)}")
             raise RuntimeError("Failed to retrieve structured output from LLM") from e
 
     async def get_context(self, query: str) -> Any:
         """Find relevant code files based on the query."""
+        logger.info(
+            f"Starting code retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'"
+        )
+
         if not query or not isinstance(query, str):
+            logger.error("Invalid query: must be a non-empty string")
             raise ValueError("The query must be a non-empty string.")
 
         try:
             vector_engine = get_vector_engine()
             graph_engine = await get_graph_engine()
+            logger.debug("Successfully initialized vector and graph engines")
         except Exception as e:
+            logger.error(f"Database initialization error: {str(e)}")
             raise RuntimeError("Database initialization error in code_graph_retriever, ") from e
 
         files_and_codeparts = await self._process_query(query)
@@ -63,52 +81,80 @@ class CodeRetriever(BaseRetriever):
         similar_codepieces = []
 
         if not files_and_codeparts.filenames or not files_and_codeparts.sourcecode:
+            logger.info("No specific files/code extracted from query, performing general search")
+
             for collection in self.file_name_collections:
+                logger.debug(f"Searching {collection} collection with general query")
                 search_results_file = await vector_engine.search(
                     collection, query, limit=self.top_k
                 )
+                logger.debug(f"Found {len(search_results_file)} results in {collection}")
                 for res in search_results_file:
                     similar_filenames.append(
                         {"id": res.id, "score": res.score, "payload": res.payload}
                     )
 
             for collection in self.classes_and_functions_collections:
+                logger.debug(f"Searching {collection} collection with general query")
                 search_results_code = await vector_engine.search(
                     collection, query, limit=self.top_k
                 )
+                logger.debug(f"Found {len(search_results_code)} results in {collection}")
                 for res in search_results_code:
                     similar_codepieces.append(
                         {"id": res.id, "score": res.score, "payload": res.payload}
                     )
         else:
+            logger.info(
+                f"Using extracted filenames ({len(files_and_codeparts.filenames)}) and source code for targeted search"
+            )
+
             for collection in self.file_name_collections:
                 for file_from_query in files_and_codeparts.filenames:
+                    logger.debug(f"Searching {collection} for specific file: {file_from_query}")
                     search_results_file = await vector_engine.search(
                         collection, file_from_query, limit=self.top_k
                     )
+                    logger.debug(
+                        f"Found {len(search_results_file)} results for file {file_from_query}"
+                    )
                    for res in search_results_file:
                         similar_filenames.append(
                             {"id": res.id, "score": res.score, "payload": res.payload}
                         )
 
             for collection in self.classes_and_functions_collections:
+                logger.debug(f"Searching {collection} with extracted source code")
                 search_results_code = await vector_engine.search(
                     collection, files_and_codeparts.sourcecode, limit=self.top_k
                 )
+                logger.debug(f"Found {len(search_results_code)} results for source code search")
                 for res in search_results_code:
                     similar_codepieces.append(
                         {"id": res.id, "score": res.score, "payload": res.payload}
                     )
 
+        total_items = len(similar_filenames) + len(similar_codepieces)
+        logger.info(
+            f"Total search results: {total_items} items ({len(similar_filenames)} filenames, {len(similar_codepieces)} code pieces)"
+        )
+
+        if total_items == 0:
+            logger.warning("No search results found, returning empty list")
+            return []
+
+        logger.debug("Getting graph connections for all search results")
         relevant_triplets = await asyncio.gather(
             *[
                 graph_engine.get_connections(similar_piece["id"])
                 for similar_piece in similar_filenames + similar_codepieces
             ]
         )
+        logger.info(f"Retrieved graph connections for {len(relevant_triplets)} items")
 
         paths = set()
-        for sublist in relevant_triplets:
+        for i, sublist in enumerate(relevant_triplets):
+            logger.debug(f"Processing connections for item {i}: {len(sublist)} connections")
             for tpl in sublist:
                 if isinstance(tpl, tuple) and len(tpl) >= 3:
                     if "file_path" in tpl[0]:
@@ -116,23 +162,31 @@ class CodeRetriever(BaseRetriever):
                     if "file_path" in tpl[2]:
                         paths.add(tpl[2]["file_path"])
 
+        logger.info(f"Found {len(paths)} unique file paths to read")
+
         retrieved_files = {}
         read_tasks = []
         for file_path in paths:
 
             async def read_file(fp):
                 try:
+                    logger.debug(f"Reading file: {fp}")
                     async with aiofiles.open(fp, "r", encoding="utf-8") as f:
-                        retrieved_files[fp] = await f.read()
+                        content = await f.read()
+                        retrieved_files[fp] = content
+                        logger.debug(f"Successfully read {len(content)} characters from {fp}")
                 except Exception as e:
-                    print(f"Error reading {fp}: {e}")
+                    logger.error(f"Error reading {fp}: {e}")
                     retrieved_files[fp] = ""
 
             read_tasks.append(read_file(file_path))
 
         await asyncio.gather(*read_tasks)
+        logger.info(
+            f"Successfully read {len([f for f in retrieved_files.values() if f])} files (out of {len(paths)} total)"
+        )
 
-        return [
+        result = [
             {
                 "name": file_path,
                 "description": file_path,
@@ -141,6 +195,9 @@ class CodeRetriever(BaseRetriever):
             for file_path in paths
         ]
 
+        logger.info(f"Returning {len(result)} code file contexts")
+        return result
+
     async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
         """Returns the code files context."""
         if context is None:
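
The recurring substitution in code_retriever.py (and in the retrievers below) replaces the per-call get_llm_client()/read_query_prompt pair with the new static LLMGateway facade (see cognee/infrastructure/llm/LLMGateway.py, +137 lines, in the file list above). A sketch of the shared call pattern; MyModel is an illustrative response model, not part of the diff:

    from pydantic import BaseModel

    from cognee.infrastructure.llm.LLMGateway import LLMGateway

    class MyModel(BaseModel):
        answer: str

    async def ask(question: str) -> MyModel:
        # Both prompt loading and structured output now go through the facade,
        # exactly as in _process_query above.
        system_prompt = LLMGateway.read_query_prompt("codegraph_retriever_system.txt")
        return await LLMGateway.acreate_structured_output(
            text_input=question,
            system_prompt=system_prompt,
            response_model=MyModel,
        )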
--- a/cognee/modules/retrieval/completion_retriever.py
+++ b/cognee/modules/retrieval/completion_retriever.py
@@ -1,11 +1,14 @@
 from typing import Any, Optional
 
+from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.modules.retrieval.utils.completion import generate_completion
 from cognee.modules.retrieval.base_retriever import BaseRetriever
 from cognee.modules.retrieval.exceptions.exceptions import NoDataError
 from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
 
+logger = get_logger("CompletionRetriever")
+
 
 class CompletionRetriever(BaseRetriever):
     """
@@ -56,8 +59,10 @@ class CompletionRetriever(BaseRetriever):
 
             # Combine all chunks text returned from vector search (number of chunks is determined by top_k
             chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks]
-            return "\n".join(chunks_payload)
+            combined_context = "\n".join(chunks_payload)
+            return combined_context
         except CollectionNotFoundError as error:
+            logger.error("DocumentChunk_text collection not found")
             raise NoDataError("No data found in the system, please add data first.") from error
 
     async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
@@ -70,22 +75,19 @@ class CompletionRetriever(BaseRetriever):
         Parameters:
         -----------
 
-        - query (str): The input query for which the completion is generated.
-        - context (Optional[Any]): Optional context to use for generating the completion; if
-          not provided, it will be retrieved using get_context. (default None)
+        - query (str): The query string to be used for generating a completion.
+        - context (Optional[Any]): Optional pre-fetched context to use for generating the
+          completion; if None, it retrieves the context for the query. (default None)
 
         Returns:
         --------
 
-        - Any: A list containing the generated completion from the LLM.
+        - Any: The generated completion based on the provided query and context.
         """
         if context is None:
             context = await self.get_context(query)
 
         completion = await generate_completion(
-            query=query,
-            context=context,
-            user_prompt_path=self.user_prompt_path,
-            system_prompt_path=self.system_prompt_path,
+            query, context, self.user_prompt_path, self.system_prompt_path
         )
         return [completion]
--- a/cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py
+++ b/cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py
@@ -4,8 +4,6 @@ import asyncio
 from cognee.infrastructure.context.BaseContextProvider import BaseContextProvider
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.modules.retrieval.utils.brute_force_triplet_search import (
     brute_force_triplet_search,
     format_triplets,
--- a/cognee/modules/retrieval/graph_completion_cot_retriever.py
+++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py
@@ -1,9 +1,7 @@
 from typing import Any, Optional, List, Type
 from cognee.shared.logging_utils import get_logger
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
 from cognee.modules.retrieval.utils.completion import generate_completion
-from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
 
 logger = get_logger()
 