cognee 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +5 -1
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/datasets/datasets.py +11 -0
  7. cognee/api/v1/responses/default_tools.py +0 -1
  8. cognee/api/v1/responses/dispatch_function.py +1 -1
  9. cognee/api/v1/responses/routers/default_tools.py +0 -1
  10. cognee/api/v1/search/search.py +11 -9
  11. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  12. cognee/api/v1/ui/ui.py +47 -16
  13. cognee/api/v1/update/routers/get_update_router.py +1 -1
  14. cognee/api/v1/update/update.py +3 -3
  15. cognee/cli/_cognee.py +61 -10
  16. cognee/cli/commands/add_command.py +3 -3
  17. cognee/cli/commands/cognify_command.py +3 -3
  18. cognee/cli/commands/config_command.py +9 -7
  19. cognee/cli/commands/delete_command.py +3 -3
  20. cognee/cli/commands/search_command.py +3 -7
  21. cognee/cli/config.py +0 -1
  22. cognee/context_global_variables.py +5 -0
  23. cognee/exceptions/exceptions.py +1 -1
  24. cognee/infrastructure/databases/cache/__init__.py +2 -0
  25. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  26. cognee/infrastructure/databases/cache/config.py +44 -0
  27. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  28. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  29. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  30. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  31. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  32. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  33. cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
  34. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  35. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  36. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  37. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  38. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  39. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  40. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  41. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  42. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  43. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  44. cognee/infrastructure/files/exceptions.py +1 -1
  45. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  46. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  47. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  48. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  51. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  52. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  53. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  54. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  55. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  56. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  57. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  58. cognee/infrastructure/loaders/external/__init__.py +7 -0
  59. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  60. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  61. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  62. cognee/modules/data/exceptions/exceptions.py +1 -1
  63. cognee/modules/data/methods/__init__.py +3 -0
  64. cognee/modules/data/methods/get_dataset_data.py +4 -1
  65. cognee/modules/data/methods/has_dataset_data.py +21 -0
  66. cognee/modules/engine/models/TableRow.py +0 -1
  67. cognee/modules/ingestion/save_data_to_file.py +9 -2
  68. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  69. cognee/modules/pipelines/operations/pipeline.py +12 -1
  70. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  71. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  72. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  73. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  74. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  75. cognee/modules/retrieval/base_retriever.py +3 -1
  76. cognee/modules/retrieval/chunks_retriever.py +5 -1
  77. cognee/modules/retrieval/code_retriever.py +20 -2
  78. cognee/modules/retrieval/completion_retriever.py +50 -9
  79. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  80. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  81. cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
  82. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  83. cognee/modules/retrieval/lexical_retriever.py +20 -2
  84. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  85. cognee/modules/retrieval/summaries_retriever.py +5 -1
  86. cognee/modules/retrieval/temporal_retriever.py +62 -10
  87. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  88. cognee/modules/retrieval/utils/completion.py +5 -0
  89. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  90. cognee/modules/retrieval/utils/session_cache.py +156 -0
  91. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  92. cognee/modules/search/methods/no_access_control_search.py +12 -1
  93. cognee/modules/search/methods/search.py +34 -2
  94. cognee/modules/search/types/SearchType.py +0 -1
  95. cognee/modules/settings/get_settings.py +23 -0
  96. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  97. cognee/modules/users/methods/get_default_user.py +1 -6
  98. cognee/modules/users/roles/methods/create_role.py +2 -2
  99. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  100. cognee/shared/exceptions/exceptions.py +1 -1
  101. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  102. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  103. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  104. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  105. cognee/tasks/ingestion/ingest_data.py +11 -5
  106. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  107. cognee/tasks/storage/add_data_points.py +3 -10
  108. cognee/tasks/storage/index_data_points.py +19 -14
  109. cognee/tasks/storage/index_graph_edges.py +25 -11
  110. cognee/tasks/web_scraper/__init__.py +34 -0
  111. cognee/tasks/web_scraper/config.py +26 -0
  112. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  113. cognee/tasks/web_scraper/models.py +46 -0
  114. cognee/tasks/web_scraper/types.py +4 -0
  115. cognee/tasks/web_scraper/utils.py +142 -0
  116. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  117. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  118. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  119. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  120. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  121. cognee/tests/subprocesses/reader.py +25 -0
  122. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  123. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  124. cognee/tests/subprocesses/writer.py +32 -0
  125. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  126. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  127. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  128. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  129. cognee/tests/test_add_docling_document.py +56 -0
  130. cognee/tests/test_chromadb.py +7 -11
  131. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  132. cognee/tests/test_conversation_history.py +240 -0
  133. cognee/tests/test_kuzu.py +27 -15
  134. cognee/tests/test_lancedb.py +7 -11
  135. cognee/tests/test_library.py +32 -2
  136. cognee/tests/test_neo4j.py +24 -16
  137. cognee/tests/test_neptune_analytics_vector.py +7 -11
  138. cognee/tests/test_permissions.py +9 -13
  139. cognee/tests/test_pgvector.py +4 -4
  140. cognee/tests/test_remote_kuzu.py +8 -11
  141. cognee/tests/test_s3_file_storage.py +1 -1
  142. cognee/tests/test_search_db.py +6 -8
  143. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  144. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  145. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/METADATA +22 -7
  146. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -128
  147. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
  148. distributed/Dockerfile +0 -3
  149. distributed/entrypoint.py +21 -9
  150. distributed/signal.py +5 -0
  151. distributed/workers/data_point_saving_worker.py +64 -34
  152. distributed/workers/graph_saving_worker.py +71 -47
  153. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  154. cognee/modules/retrieval/insights_retriever.py +0 -133
  155. cognee/tests/test_memgraph.py +0 -109
  156. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  157. distributed/poetry.lock +0 -12238
  158. distributed/pyproject.toml +0 -185
  159. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
  160. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
  161. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/modules/pipelines/operations/run_tasks.py
@@ -4,35 +4,27 @@ import asyncio
 from uuid import UUID
 from typing import Any, List
 from functools import wraps
-from sqlalchemy import select
 
-import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.pipelines.operations.run_tasks_distributed import run_tasks_distributed
 from cognee.modules.users.models import User
-from cognee.modules.data.models import Data
-from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines.utils import generate_pipeline_id
 from cognee.modules.pipelines.exceptions import PipelineRunFailedError
-from cognee.tasks.ingestion import save_data_item_to_storage, resolve_data_directories
+from cognee.tasks.ingestion import resolve_data_directories
 from cognee.modules.pipelines.models.PipelineRunInfo import (
     PipelineRunCompleted,
     PipelineRunErrored,
     PipelineRunStarted,
-    PipelineRunYield,
-    PipelineRunAlreadyCompleted,
 )
-from cognee.modules.pipelines.models.DataItemStatus import DataItemStatus
-
 from cognee.modules.pipelines.operations import (
     log_pipeline_run_start,
     log_pipeline_run_complete,
     log_pipeline_run_error,
 )
-from .run_tasks_with_telemetry import run_tasks_with_telemetry
+from .run_tasks_data_item import run_tasks_data_item
 from ..tasks.task import Task
 
 
@@ -67,177 +59,8 @@ async def run_tasks(
     pipeline_name: str = "unknown_pipeline",
     context: dict = None,
     incremental_loading: bool = False,
+    data_per_batch: int = 20,
 ):
-    async def _run_tasks_data_item_incremental(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_name,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-    ):
-        db_engine = get_relational_engine()
-        # If incremental_loading of data is set to True don't process documents already processed by pipeline
-        # If data is being added to Cognee for the first time calculate the id of the data
-        if not isinstance(data_item, Data):
-            file_path = await save_data_item_to_storage(data_item)
-            # Ingest data and add metadata
-            async with open_data_file(file_path) as file:
-                classified_data = ingestion.classify(file)
-                # data_id is the hash of file contents + owner id to avoid duplicate data
-                data_id = ingestion.identify(classified_data, user)
-        else:
-            # If data was already processed by Cognee get data id
-            data_id = data_item.id
-
-        # Check pipeline status, if Data already processed for pipeline before skip current processing
-        async with db_engine.get_async_session() as session:
-            data_point = (
-                await session.execute(select(Data).filter(Data.id == data_id))
-            ).scalar_one_or_none()
-            if data_point:
-                if (
-                    data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id))
-                    == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
-                ):
-                    yield {
-                        "run_info": PipelineRunAlreadyCompleted(
-                            pipeline_run_id=pipeline_run_id,
-                            dataset_id=dataset.id,
-                            dataset_name=dataset.name,
-                        ),
-                        "data_id": data_id,
-                    }
-                    return
-
-        try:
-            # Process data based on data_item and list of tasks
-            async for result in run_tasks_with_telemetry(
-                tasks=tasks,
-                data=[data_item],
-                user=user,
-                pipeline_name=pipeline_id,
-                context=context,
-            ):
-                yield PipelineRunYield(
-                    pipeline_run_id=pipeline_run_id,
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                    payload=result,
-                )
-
-            # Update pipeline status for Data element
-            async with db_engine.get_async_session() as session:
-                data_point = (
-                    await session.execute(select(Data).filter(Data.id == data_id))
-                ).scalar_one_or_none()
-                data_point.pipeline_status[pipeline_name] = {
-                    str(dataset.id): DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
-                }
-                await session.merge(data_point)
-                await session.commit()
-
-            yield {
-                "run_info": PipelineRunCompleted(
-                    pipeline_run_id=pipeline_run_id,
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                ),
-                "data_id": data_id,
-            }
-
-        except Exception as error:
-            # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
-            logger.error(
-                f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
-            )
-            yield {
-                "run_info": PipelineRunErrored(
-                    pipeline_run_id=pipeline_run_id,
-                    payload=repr(error),
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                ),
-                "data_id": data_id,
-            }
-
-            if os.getenv("RAISE_INCREMENTAL_LOADING_ERRORS", "true").lower() == "true":
-                raise error
-
-    async def _run_tasks_data_item_regular(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-    ):
-        # Process data based on data_item and list of tasks
-        async for result in run_tasks_with_telemetry(
-            tasks=tasks,
-            data=[data_item],
-            user=user,
-            pipeline_name=pipeline_id,
-            context=context,
-        ):
-            yield PipelineRunYield(
-                pipeline_run_id=pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-                payload=result,
-            )
-
-        yield {
-            "run_info": PipelineRunCompleted(
-                pipeline_run_id=pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-            )
-        }
-
-    async def _run_tasks_data_item(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_name,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-        incremental_loading,
-    ):
-        # Go through async generator and return data item processing result. Result can be PipelineRunAlreadyCompleted when data item is skipped,
-        # PipelineRunCompleted when processing was successful and PipelineRunErrored if there were issues
-        result = None
-        if incremental_loading:
-            async for result in _run_tasks_data_item_incremental(
-                data_item=data_item,
-                dataset=dataset,
-                tasks=tasks,
-                pipeline_name=pipeline_name,
-                pipeline_id=pipeline_id,
-                pipeline_run_id=pipeline_run_id,
-                context=context,
-                user=user,
-            ):
-                pass
-        else:
-            async for result in _run_tasks_data_item_regular(
-                data_item=data_item,
-                dataset=dataset,
-                tasks=tasks,
-                pipeline_id=pipeline_id,
-                pipeline_run_id=pipeline_run_id,
-                context=context,
-                user=user,
-            ):
-                pass
-
-        return result
-
     if not user:
         user = await get_default_user()
 
@@ -266,24 +89,29 @@ async def run_tasks(
     if incremental_loading:
         data = await resolve_data_directories(data)
 
-    # Create async tasks per data item that will run the pipeline for the data item
-    data_item_tasks = [
-        asyncio.create_task(
-            _run_tasks_data_item(
-                data_item,
-                dataset,
-                tasks,
-                pipeline_name,
-                pipeline_id,
-                pipeline_run_id,
-                context,
-                user,
-                incremental_loading,
+    # Create and gather batches of async tasks of data items that will run the pipeline for the data item
+    results = []
+    for start in range(0, len(data), data_per_batch):
+        data_batch = data[start : start + data_per_batch]
+
+        data_item_tasks = [
+            asyncio.create_task(
+                run_tasks_data_item(
+                    data_item,
+                    dataset,
+                    tasks,
+                    pipeline_name,
+                    pipeline_id,
+                    pipeline_run_id,
+                    context,
+                    user,
+                    incremental_loading,
+                )
             )
-        )
-        for data_item in data
-    ]
-    results = await asyncio.gather(*data_item_tasks)
+            for data_item in data_batch
+        ]
+
+        results.extend(await asyncio.gather(*data_item_tasks))
 
     # Remove skipped data items from results
     results = [result for result in results if result]
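The reworked loop above no longer schedules one asyncio task per data item up front; it processes items in fixed-size batches controlled by the new data_per_batch parameter (default 20), so at most that many items are in flight at once. A minimal, self-contained sketch of the same batching pattern, with a hypothetical process_item coroutine standing in for run_tasks_data_item:

import asyncio
from typing import Any, List


async def process_item(item: Any) -> str:
    # Hypothetical stand-in for run_tasks_data_item: simulate per-item async work.
    await asyncio.sleep(0.01)
    return f"processed {item}"


async def process_in_batches(data: List[Any], data_per_batch: int = 20) -> List[str]:
    # Same shape as the new run_tasks loop: schedule at most `data_per_batch`
    # tasks at a time and accumulate results across batches, preserving order.
    results: List[str] = []
    for start in range(0, len(data), data_per_batch):
        batch = data[start : start + data_per_batch]
        tasks = [asyncio.create_task(process_item(item)) for item in batch]
        results.extend(await asyncio.gather(*tasks))
    return results


if __name__ == "__main__":
    print(asyncio.run(process_in_batches(list(range(45)), data_per_batch=20)))

Compared with gathering every item at once, this caps the concurrent work (database sessions, embedding and LLM calls) on large datasets, at the cost of not overlapping work across batch boundaries.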
cognee/modules/pipelines/operations/run_tasks_data_item.py
@@ -0,0 +1,260 @@
+"""
+Data item processing functions for pipeline operations.
+
+This module contains reusable functions for processing individual data items
+within pipeline operations, supporting both incremental and regular processing modes.
+"""
+
+import os
+from typing import Any, Dict, AsyncGenerator, Optional
+from sqlalchemy import select
+
+import cognee.modules.ingestion as ingestion
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.users.models import User
+from cognee.modules.data.models import Data, Dataset
+from cognee.tasks.ingestion import save_data_item_to_storage
+from cognee.modules.pipelines.models.PipelineRunInfo import (
+    PipelineRunCompleted,
+    PipelineRunErrored,
+    PipelineRunYield,
+    PipelineRunAlreadyCompleted,
+)
+from cognee.modules.pipelines.models.DataItemStatus import DataItemStatus
+from cognee.modules.pipelines.operations.run_tasks_with_telemetry import run_tasks_with_telemetry
+from ..tasks.task import Task
+
+logger = get_logger("run_tasks_data_item")
+
+
+async def run_tasks_data_item_incremental(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_name: str,
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Process a single data item with incremental loading support.
+
+    This function handles incremental processing by checking if the data item
+    has already been processed for the given pipeline and dataset. If it has,
+    it skips processing and returns a completion status.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_name: Name of the pipeline
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+
+    Yields:
+        Dict containing run_info and data_id for each processing step
+    """
+    db_engine = get_relational_engine()
+
+    # If incremental_loading of data is set to True don't process documents already processed by pipeline
+    # If data is being added to Cognee for the first time calculate the id of the data
+    if not isinstance(data_item, Data):
+        file_path = await save_data_item_to_storage(data_item)
+        # Ingest data and add metadata
+        async with open_data_file(file_path) as file:
+            classified_data = ingestion.classify(file)
+            # data_id is the hash of file contents + owner id to avoid duplicate data
+            data_id = ingestion.identify(classified_data, user)
+    else:
+        # If data was already processed by Cognee get data id
+        data_id = data_item.id
+
+    # Check pipeline status, if Data already processed for pipeline before skip current processing
+    async with db_engine.get_async_session() as session:
+        data_point = (
+            await session.execute(select(Data).filter(Data.id == data_id))
+        ).scalar_one_or_none()
+        if data_point:
+            if (
+                data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id))
+                == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+            ):
+                yield {
+                    "run_info": PipelineRunAlreadyCompleted(
+                        pipeline_run_id=pipeline_run_id,
+                        dataset_id=dataset.id,
+                        dataset_name=dataset.name,
+                    ),
+                    "data_id": data_id,
+                }
+                return
+
+    try:
+        # Process data based on data_item and list of tasks
+        async for result in run_tasks_with_telemetry(
+            tasks=tasks,
+            data=[data_item],
+            user=user,
+            pipeline_name=pipeline_id,
+            context=context,
+        ):
+            yield PipelineRunYield(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+                payload=result,
+            )
+
+        # Update pipeline status for Data element
+        async with db_engine.get_async_session() as session:
+            data_point = (
+                await session.execute(select(Data).filter(Data.id == data_id))
+            ).scalar_one_or_none()
+            status_for_pipeline = data_point.pipeline_status.setdefault(pipeline_name, {})
+            status_for_pipeline[str(dataset.id)] = DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+            await session.merge(data_point)
+            await session.commit()
+
+        yield {
+            "run_info": PipelineRunCompleted(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+            ),
+            "data_id": data_id,
+        }
+
+    except Exception as error:
+        # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
+        logger.error(
+            f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
+        )
+        yield {
+            "run_info": PipelineRunErrored(
+                pipeline_run_id=pipeline_run_id,
+                payload=repr(error),
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+            ),
+            "data_id": data_id,
+        }
+
+        if os.getenv("RAISE_INCREMENTAL_LOADING_ERRORS", "true").lower() == "true":
+            raise error
+
+
+async def run_tasks_data_item_regular(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Process a single data item in regular (non-incremental) mode.
+
+    This function processes a data item without checking for previous processing
+    status, executing all tasks on the data item.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+
+    Yields:
+        Dict containing run_info for each processing step
+    """
+    # Process data based on data_item and list of tasks
+    async for result in run_tasks_with_telemetry(
+        tasks=tasks,
+        data=[data_item],
+        user=user,
+        pipeline_name=pipeline_id,
+        context=context,
+    ):
+        yield PipelineRunYield(
+            pipeline_run_id=pipeline_run_id,
+            dataset_id=dataset.id,
+            dataset_name=dataset.name,
+            payload=result,
+        )
+
+    yield {
+        "run_info": PipelineRunCompleted(
+            pipeline_run_id=pipeline_run_id,
+            dataset_id=dataset.id,
+            dataset_name=dataset.name,
+        )
+    }
+
+
+async def run_tasks_data_item(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_name: str,
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+    incremental_loading: bool,
+) -> Optional[Dict[str, Any]]:
+    """
+    Process a single data item, choosing between incremental and regular processing.
+
+    This is the main entry point for data item processing that delegates to either
+    incremental or regular processing based on the incremental_loading flag.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_name: Name of the pipeline
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+        incremental_loading: Whether to use incremental processing
+
+    Returns:
+        Dict containing the final processing result, or None if processing was skipped
+    """
+    # Go through async generator and return data item processing result. Result can be PipelineRunAlreadyCompleted when data item is skipped,
+    # PipelineRunCompleted when processing was successful and PipelineRunErrored if there were issues
+    result = None
+    if incremental_loading:
+        async for result in run_tasks_data_item_incremental(
+            data_item=data_item,
+            dataset=dataset,
+            tasks=tasks,
+            pipeline_name=pipeline_name,
+            pipeline_id=pipeline_id,
+            pipeline_run_id=pipeline_run_id,
+            context=context,
+            user=user,
+        ):
+            pass
+    else:
+        async for result in run_tasks_data_item_regular(
+            data_item=data_item,
+            dataset=dataset,
+            tasks=tasks,
+            pipeline_id=pipeline_id,
+            pipeline_run_id=pipeline_run_id,
+            context=context,
+            user=user,
+        ):
+            pass
+
+    return result
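run_tasks_data_item above drains whichever generator it delegates to and returns only the last value yielded, which carries the final run status (PipelineRunAlreadyCompleted, PipelineRunCompleted, or PipelineRunErrored). A minimal sketch of that drain-and-return-last pattern, with a hypothetical generator standing in for the incremental/regular variants:

import asyncio
from typing import AsyncGenerator, Dict, Optional


async def item_steps() -> AsyncGenerator[Dict[str, str], None]:
    # Hypothetical stand-in for run_tasks_data_item_incremental/_regular:
    # intermediate yields carry task payloads, the final yield carries the run status.
    yield {"payload": "chunking"}
    yield {"payload": "graph extraction"}
    yield {"run_info": "PipelineRunCompleted", "data_id": "item-1"}


async def last_yielded(steps: AsyncGenerator[Dict[str, str], None]) -> Optional[Dict[str, str]]:
    # Same pattern as run_tasks_data_item: iterate to exhaustion, keep only the last value.
    result: Optional[Dict[str, str]] = None
    async for result in steps:
        pass
    return result


if __name__ == "__main__":
    print(asyncio.run(last_yielded(item_steps())))

Note also that the incremental variant now records completion via pipeline_status.setdefault(pipeline_name, {}) before setting the dataset entry, whereas the removed inline version replaced the whole per-pipeline dict and so dropped statuses previously recorded for other datasets.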