cognee 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +5 -1
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/datasets/datasets.py +11 -0
  7. cognee/api/v1/responses/default_tools.py +0 -1
  8. cognee/api/v1/responses/dispatch_function.py +1 -1
  9. cognee/api/v1/responses/routers/default_tools.py +0 -1
  10. cognee/api/v1/search/search.py +11 -9
  11. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  12. cognee/api/v1/ui/ui.py +47 -16
  13. cognee/api/v1/update/routers/get_update_router.py +1 -1
  14. cognee/api/v1/update/update.py +3 -3
  15. cognee/cli/_cognee.py +61 -10
  16. cognee/cli/commands/add_command.py +3 -3
  17. cognee/cli/commands/cognify_command.py +3 -3
  18. cognee/cli/commands/config_command.py +9 -7
  19. cognee/cli/commands/delete_command.py +3 -3
  20. cognee/cli/commands/search_command.py +3 -7
  21. cognee/cli/config.py +0 -1
  22. cognee/context_global_variables.py +5 -0
  23. cognee/exceptions/exceptions.py +1 -1
  24. cognee/infrastructure/databases/cache/__init__.py +2 -0
  25. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  26. cognee/infrastructure/databases/cache/config.py +44 -0
  27. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  28. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  29. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  30. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  31. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  32. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  33. cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
  34. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  35. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  36. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  37. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  38. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  39. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  40. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  41. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  42. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  43. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  44. cognee/infrastructure/files/exceptions.py +1 -1
  45. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  46. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  47. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  48. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  51. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  52. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  53. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  54. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  55. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  56. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  57. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  58. cognee/infrastructure/loaders/external/__init__.py +7 -0
  59. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  60. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  61. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  62. cognee/modules/data/exceptions/exceptions.py +1 -1
  63. cognee/modules/data/methods/__init__.py +3 -0
  64. cognee/modules/data/methods/get_dataset_data.py +4 -1
  65. cognee/modules/data/methods/has_dataset_data.py +21 -0
  66. cognee/modules/engine/models/TableRow.py +0 -1
  67. cognee/modules/ingestion/save_data_to_file.py +9 -2
  68. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  69. cognee/modules/pipelines/operations/pipeline.py +12 -1
  70. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  71. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  72. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  73. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  74. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  75. cognee/modules/retrieval/base_retriever.py +3 -1
  76. cognee/modules/retrieval/chunks_retriever.py +5 -1
  77. cognee/modules/retrieval/code_retriever.py +20 -2
  78. cognee/modules/retrieval/completion_retriever.py +50 -9
  79. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  80. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  81. cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
  82. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  83. cognee/modules/retrieval/lexical_retriever.py +20 -2
  84. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  85. cognee/modules/retrieval/summaries_retriever.py +5 -1
  86. cognee/modules/retrieval/temporal_retriever.py +62 -10
  87. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  88. cognee/modules/retrieval/utils/completion.py +5 -0
  89. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  90. cognee/modules/retrieval/utils/session_cache.py +156 -0
  91. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  92. cognee/modules/search/methods/no_access_control_search.py +12 -1
  93. cognee/modules/search/methods/search.py +34 -2
  94. cognee/modules/search/types/SearchType.py +0 -1
  95. cognee/modules/settings/get_settings.py +23 -0
  96. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  97. cognee/modules/users/methods/get_default_user.py +1 -6
  98. cognee/modules/users/roles/methods/create_role.py +2 -2
  99. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  100. cognee/shared/exceptions/exceptions.py +1 -1
  101. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  102. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  103. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  104. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  105. cognee/tasks/ingestion/ingest_data.py +11 -5
  106. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  107. cognee/tasks/storage/add_data_points.py +3 -10
  108. cognee/tasks/storage/index_data_points.py +19 -14
  109. cognee/tasks/storage/index_graph_edges.py +25 -11
  110. cognee/tasks/web_scraper/__init__.py +34 -0
  111. cognee/tasks/web_scraper/config.py +26 -0
  112. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  113. cognee/tasks/web_scraper/models.py +46 -0
  114. cognee/tasks/web_scraper/types.py +4 -0
  115. cognee/tasks/web_scraper/utils.py +142 -0
  116. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  117. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  118. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  119. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  120. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  121. cognee/tests/subprocesses/reader.py +25 -0
  122. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  123. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  124. cognee/tests/subprocesses/writer.py +32 -0
  125. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  126. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  127. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  128. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  129. cognee/tests/test_add_docling_document.py +56 -0
  130. cognee/tests/test_chromadb.py +7 -11
  131. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  132. cognee/tests/test_conversation_history.py +240 -0
  133. cognee/tests/test_kuzu.py +27 -15
  134. cognee/tests/test_lancedb.py +7 -11
  135. cognee/tests/test_library.py +32 -2
  136. cognee/tests/test_neo4j.py +24 -16
  137. cognee/tests/test_neptune_analytics_vector.py +7 -11
  138. cognee/tests/test_permissions.py +9 -13
  139. cognee/tests/test_pgvector.py +4 -4
  140. cognee/tests/test_remote_kuzu.py +8 -11
  141. cognee/tests/test_s3_file_storage.py +1 -1
  142. cognee/tests/test_search_db.py +6 -8
  143. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  144. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  145. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/METADATA +21 -6
  146. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -126
  147. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
  148. distributed/Dockerfile +0 -3
  149. distributed/entrypoint.py +21 -9
  150. distributed/signal.py +5 -0
  151. distributed/workers/data_point_saving_worker.py +64 -34
  152. distributed/workers/graph_saving_worker.py +71 -47
  153. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  154. cognee/modules/retrieval/insights_retriever.py +0 -133
  155. cognee/tests/test_memgraph.py +0 -109
  156. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  157. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
  158. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
  159. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
@@ -5,6 +5,8 @@ from uuid import UUID
 from fastapi.encoders import jsonable_encoder
 from typing import Any, List, Optional, Tuple, Type, Union
 
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.shared.logging_utils import get_logger
 from cognee.shared.utils import send_telemetry
 from cognee.context_global_variables import set_database_global_context_variables
 
@@ -27,6 +29,8 @@ from .get_search_type_tools import get_search_type_tools
 from .no_access_control_search import no_access_control_search
 from ..utils.prepare_search_result import prepare_search_result
 
+logger = get_logger()
+
 
 async def search(
     query_text: str,
@@ -42,6 +46,7 @@ async def search(
     last_k: Optional[int] = None,
     only_context: bool = False,
     use_combined_context: bool = False,
+    session_id: Optional[str] = None,
 ) -> Union[CombinedSearchResult, List[SearchResult]]:
     """
 
@@ -77,6 +82,7 @@ async def search(
             last_k=last_k,
             only_context=only_context,
             use_combined_context=use_combined_context,
+            session_id=session_id,
         )
     else:
         search_results = [
@@ -91,6 +97,7 @@ async def search(
                 save_interaction=save_interaction,
                 last_k=last_k,
                 only_context=only_context,
+                session_id=session_id,
             )
         ]
 
@@ -195,6 +202,7 @@ async def authorized_search(
     last_k: Optional[int] = None,
     only_context: bool = False,
     use_combined_context: bool = False,
+    session_id: Optional[str] = None,
 ) -> Union[
     Tuple[Any, Union[List[Edge], str], List[Dataset]],
     List[Tuple[Any, Union[List[Edge], str], List[Dataset]]],
@@ -221,6 +229,7 @@ async def authorized_search(
            save_interaction=save_interaction,
            last_k=last_k,
            only_context=True,
+           session_id=session_id,
        )
 
        context = {}
@@ -263,7 +272,7 @@ async def authorized_search(
            return combined_context
 
        combined_context = prepare_combined_context(context)
-       completion = await get_completion(query_text, combined_context)
+       completion = await get_completion(query_text, combined_context, session_id=session_id)
 
        return completion, combined_context, datasets
 
@@ -280,6 +289,7 @@ async def authorized_search(
            save_interaction=save_interaction,
            last_k=last_k,
            only_context=only_context,
+           session_id=session_id,
        )
 
        return search_results
@@ -298,6 +308,7 @@ async def search_in_datasets_context(
     last_k: Optional[int] = None,
     only_context: bool = False,
     context: Optional[Any] = None,
+    session_id: Optional[str] = None,
 ) -> List[Tuple[Any, Union[str, List[Edge]], List[Dataset]]]:
     """
     Searches all provided datasets and handles setting up of appropriate database context based on permissions.
@@ -317,10 +328,30 @@ async def search_in_datasets_context(
        last_k: Optional[int] = None,
        only_context: bool = False,
        context: Optional[Any] = None,
+       session_id: Optional[str] = None,
    ) -> Tuple[Any, Union[str, List[Edge]], List[Dataset]]:
        # Set database configuration in async context for each dataset user has access for
        await set_database_global_context_variables(dataset.id, dataset.owner_id)
 
+       graph_engine = await get_graph_engine()
+       is_empty = await graph_engine.is_empty()
+
+       if is_empty:
+           # TODO: we can log here, but not all search types use graph. Still keeping this here for reviewer input
+           from cognee.modules.data.methods import get_dataset_data
+
+           dataset_data = await get_dataset_data(dataset.id)
+
+           if len(dataset_data) > 0:
+               logger.warning(
+                   f"Dataset '{dataset.name}' has {len(dataset_data)} data item(s) but the knowledge graph is empty. "
+                   "Please run cognify to process the data before searching."
+               )
+           else:
+               logger.warning(
+                   "Search attempt on an empty knowledge graph - no data has been added to this dataset"
+               )
+
        specific_search_tools = await get_search_type_tools(
            query_type=query_type,
            query_text=query_text,
@@ -340,7 +371,7 @@ async def search_in_datasets_context(
                return None, await get_context(query_text), [dataset]
 
            search_context = context or await get_context(query_text)
-           search_result = await get_completion(query_text, search_context)
+           search_result = await get_completion(query_text, search_context, session_id=session_id)
 
            return search_result, search_context, [dataset]
        else:
@@ -365,6 +396,7 @@ async def search_in_datasets_context(
                last_k=last_k,
                only_context=only_context,
                context=context,
+               session_id=session_id,
            )
        )
 
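Note, not part of the diff: the hunks above thread an optional session_id from search() through authorized_search() and search_in_datasets_context() down to get_completion(). An illustrative caller-side sketch, assuming the public cognee.search wrapper (touched in cognee/api/v1/search/search.py above) forwards the new parameter:

    import asyncio

    import cognee
    from cognee.modules.search.types import SearchType


    async def main():
        session_id = "demo-session"  # reuse the same id to keep one conversation session

        first = await cognee.search(
            query_text="Who founded the company?",
            query_type=SearchType.GRAPH_COMPLETION,
            session_id=session_id,
        )
        # A follow-up in the same session can rely on the stored interaction history.
        follow_up = await cognee.search(
            query_text="And in which year?",
            query_type=SearchType.GRAPH_COMPLETION,
            session_id=session_id,
        )
        print(first, follow_up)


    asyncio.run(main())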
@@ -3,7 +3,6 @@ from enum import Enum
 
 class SearchType(Enum):
     SUMMARIES = "SUMMARIES"
-    INSIGHTS = "INSIGHTS"
     CHUNKS = "CHUNKS"
     RAG_COMPLETION = "RAG_COMPLETION"
     GRAPH_COMPLETION = "GRAPH_COMPLETION"
@@ -15,6 +15,7 @@ class ModelName(Enum):
     ollama = "ollama"
     anthropic = "anthropic"
     gemini = "gemini"
+    mistral = "mistral"
 
 
 class LLMConfig(BaseModel):
@@ -72,6 +73,10 @@ def get_settings() -> SettingsDict:
            "value": "gemini",
            "label": "Gemini",
        },
+       {
+           "value": "mistral",
+           "label": "Mistral",
+       },
    ]
 
    return SettingsDict.model_validate(
@@ -134,6 +139,24 @@ def get_settings() -> SettingsDict:
                    "label": "Gemini 2.0 Flash",
                },
            ],
+           "mistral": [
+               {
+                   "value": "mistral-medium-2508",
+                   "label": "Mistral Medium 3.1",
+               },
+               {
+                   "value": "magistral-medium-2509",
+                   "label": "Magistral Medium 1.2",
+               },
+               {
+                   "value": "magistral-medium-2507",
+                   "label": "Magistral Medium 1.1",
+               },
+               {
+                   "value": "mistral-large-2411",
+                   "label": "Mistral Large 2.1",
+               },
+           ],
        },
    },
    vector_db={
@@ -37,6 +37,8 @@ async def get_authenticated_user(
     except Exception as e:
         # Convert any get_default_user failure into a proper HTTP 500 error
         logger.error(f"Failed to create default user: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Failed to create default user: {str(e)}")
+        raise HTTPException(
+            status_code=500, detail=f"Failed to create default user: {str(e)}"
+        ) from e
 
     return user
@@ -27,12 +27,7 @@ async def get_default_user() -> SimpleNamespace:
         if user is None:
             return await create_default_user()
 
-        # We return a SimpleNamespace to have the same user type as our SaaS
-        # SimpleNamespace is just a dictionary which can be accessed through attributes
-        auth_data = SimpleNamespace(
-            id=user.id, email=user.email, tenant_id=user.tenant_id, roles=[]
-        )
-        return auth_data
+        return user
     except Exception as error:
         if "principals" in str(error.args):
             raise DatabaseNotCreatedError() from error
@@ -40,8 +40,8 @@ async def create_role(
         # Add association directly to the association table
         role = Role(name=role_name, tenant_id=tenant.id)
         session.add(role)
-    except IntegrityError:
-        raise EntityAlreadyExistsError(message="Role already exists for tenant.")
+    except IntegrityError as e:
+        raise EntityAlreadyExistsError(message="Role already exists for tenant.") from e
 
     await session.commit()
     await session.refresh(role)
@@ -35,5 +35,5 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID:
         await session.merge(user)
         await session.commit()
         return tenant.id
-    except IntegrityError:
-        raise EntityAlreadyExistsError(message="Tenant already exists.")
+    except IntegrityError as e:
+        raise EntityAlreadyExistsError(message="Tenant already exists.") from e
@@ -7,6 +7,6 @@ class IngestionError(CogneeValidationError):
         self,
         message: str = "Failed to load data.",
         name: str = "IngestionError",
-        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         super().__init__(message, name, status_code)
@@ -124,5 +124,4 @@ async def add_rule_associations(
 
     if len(edges_to_save) > 0:
         await graph_engine.add_edges(edges_to_save)
-
-    await index_graph_edges()
+        await index_graph_edges(edges_to_save)
@@ -12,7 +12,7 @@ class WrongDataDocumentInputError(CogneeValidationError):
         self,
         field: str,
         name: str = "WrongDataDocumentInputError",
-        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         message = f"Missing of invalid parameter: '{field}'."
         super().__init__(message, name, status_code)
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
+from cognee.tasks.storage import index_graph_edges
 from cognee.tasks.storage.add_data_points import add_data_points
 from cognee.modules.ontology.ontology_config import Config
 from cognee.modules.ontology.get_default_ontology_resolver import (
@@ -88,6 +89,7 @@ async def integrate_chunk_graphs(
 
     if len(graph_edges) > 0:
         await graph_engine.add_edges(graph_edges)
+        await index_graph_edges(graph_edges)
 
     return data_chunks
 
@@ -1,6 +1,6 @@
 import os
 from urllib.parse import urlparse
-from typing import List, Tuple
+from typing import Any, List, Tuple
 from pathlib import Path
 import tempfile
 
@@ -34,7 +34,8 @@ async def pull_from_s3(file_path, destination_file) -> None:
 
 
 async def data_item_to_text_file(
-    data_item_path: str, preferred_loaders: List[str]
+    data_item_path: str,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
 ) -> Tuple[str, LoaderInterface]:
     if isinstance(data_item_path, str):
         parsed_url = urlparse(data_item_path)
@@ -74,6 +75,5 @@ async def data_item_to_text_file(
         )
     else:
         raise IngestionError(message="Local files are not accepted.")
-
     # data is not a supported type
     raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@@ -27,7 +28,7 @@ async def ingest_data(
     user: User,
     node_set: Optional[List[str]] = None,
     dataset_id: UUID = None,
-    preferred_loaders: List[str] = None,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
 ):
     if not user:
         user = await get_default_user()
@@ -44,7 +45,7 @@ async def ingest_data(
        user: User,
        node_set: Optional[List[str]] = None,
        dataset_id: UUID = None,
-       preferred_loaders: List[str] = None,
+       preferred_loaders: dict[str, dict[str, Any]] = None,
    ):
        new_datapoints = []
        existing_data_points = []
@@ -77,22 +78,27 @@ async def ingest_data(
        dataset_data_map = {str(data.id): True for data in dataset_data}
 
        for data_item in data:
-           # Get file path of data item or create a file it doesn't exist
+           # Get file path of data item or create a file if it doesn't exist
            original_file_path = await save_data_item_to_storage(data_item)
-
            # Transform file path to be OS usable
            actual_file_path = get_data_file_path(original_file_path)
 
            # Store all input data as text files in Cognee data storage
            cognee_storage_file_path, loader_engine = await data_item_to_text_file(
-               actual_file_path, preferred_loaders
+               actual_file_path,
+               preferred_loaders,
            )
 
+           if loader_engine is None:
+               raise IngestionError("Loader cannot be None")
+
            # Find metadata from original file
+           # Standard flow: extract metadata from both original and stored files
            async with open_data_file(original_file_path) as file:
                classified_data = ingestion.classify(file)
 
            # data_id is the hash of original file contents + owner id to avoid duplicate data
+
            data_id = ingestion.identify(classified_data, user)
            original_file_metadata = classified_data.get_metadata()
 
@@ -8,6 +8,9 @@ from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
+from cognee.tasks.web_scraper.utils import fetch_page_content
+
+
 logger = get_logger()
 
 
@@ -27,6 +30,12 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
 
         return await get_data_from_llama_index(data_item)
 
+    if "docling" in str(type(data_item)):
+        from docling_core.types import DoclingDocument
+
+        if isinstance(data_item, DoclingDocument):
+            data_item = data_item.export_to_text()
+
     # data is a file object coming from upload.
     if hasattr(data_item, "file"):
         return await save_data_to_file(data_item.file, filename=data_item.filename)
@@ -48,7 +57,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
         # data is s3 file path
         if parsed_url.scheme == "s3":
             return data_item
-
+        elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
+            urls_to_page_contents = await fetch_page_content(data_item)
+            return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
         # data is local file path
         elif parsed_url.scheme == "file":
             if settings.accept_local_file_path:
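Note, not part of the diff: with the new http/https branch, a plain URL handed to the ingestion flow is fetched with fetch_page_content and persisted as an .html file before loading. A minimal sketch of that branch in isolation, assuming fetch_page_content returns a mapping of URL to page content, as the urls_to_page_contents[data_item] lookup implies:

    import asyncio

    from cognee.modules.ingestion import save_data_to_file
    from cognee.tasks.web_scraper.utils import fetch_page_content


    async def save_url_as_html(url: str) -> str:
        # fetch_page_content returns {url: page_content}; index by the same URL.
        urls_to_page_contents = await fetch_page_content(url)
        # Persist the fetched HTML into Cognee's data storage and return the stored path.
        return await save_data_to_file(urls_to_page_contents[url], file_extension="html")


    print(asyncio.run(save_url_as_html("https://example.com")))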
@@ -10,9 +10,7 @@ from cognee.tasks.storage.exceptions import (
 )
 
 
-async def add_data_points(
-    data_points: List[DataPoint], update_edge_collection: bool = True
-) -> List[DataPoint]:
+async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
     """
     Add a batch of data points to the graph database by extracting nodes and edges,
     deduplicating them, and indexing them for retrieval.
@@ -25,9 +23,6 @@ async def add_data_points(
     Args:
         data_points (List[DataPoint]):
             A list of data points to process and insert into the graph.
-        update_edge_collection (bool, optional):
-            Whether to update the edge index after adding edges.
-            Defaults to True.
 
     Returns:
         List[DataPoint]:
@@ -73,12 +68,10 @@ async def add_data_points(
 
     graph_engine = await get_graph_engine()
 
+    await graph_engine.add_nodes(nodes)
     await index_data_points(nodes)
 
-    await graph_engine.add_nodes(nodes)
     await graph_engine.add_edges(edges)
-
-    if update_edge_collection:
-        await index_graph_edges()
+    await index_graph_edges(edges)
 
     return data_points
@@ -1,6 +1,6 @@
-from cognee.shared.logging_utils import get_logger
+import asyncio
 
-from cognee.infrastructure.databases.exceptions import EmbeddingException
+from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 
@@ -33,18 +33,23 @@ async def index_data_points(data_points: list[DataPoint]):
             indexed_data_point.metadata["index_fields"] = [field_name]
             index_points[index_name].append(indexed_data_point)
 
-    for index_name_and_field, indexable_points in index_points.items():
-        first_occurence = index_name_and_field.index("_")
-        index_name = index_name_and_field[:first_occurence]
-        field_name = index_name_and_field[first_occurence + 1 :]
-        try:
-            # In case the amount of indexable points is too large we need to send them in batches
-            batch_size = vector_engine.embedding_engine.get_batch_size()
-            for i in range(0, len(indexable_points), batch_size):
-                batch = indexable_points[i : i + batch_size]
-                await vector_engine.index_data_points(index_name, field_name, batch)
-        except EmbeddingException as e:
-            logger.warning(f"Failed to index data points for {index_name}.{field_name}: {e}")
+    tasks: list[asyncio.Task] = []
+    batch_size = vector_engine.embedding_engine.get_batch_size()
+
+    for index_name_and_field, points in index_points.items():
+        first = index_name_and_field.index("_")
+        index_name = index_name_and_field[:first]
+        field_name = index_name_and_field[first + 1 :]
+
+        # Create embedding requests per batch to run in parallel later
+        for i in range(0, len(points), batch_size):
+            batch = points[i : i + batch_size]
+            tasks.append(
+                asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch))
+            )
+
+    # Run all embedding requests in parallel
+    await asyncio.gather(*tasks)
 
     return data_points
 
@@ -1,15 +1,20 @@
+import asyncio
+
 from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
-from cognee.shared.logging_utils import get_logger, ERROR
+from cognee.shared.logging_utils import get_logger
 from collections import Counter
-
+from typing import Optional, Dict, Any, List, Tuple, Union
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.models.EdgeType import EdgeType
+from cognee.infrastructure.databases.graph.graph_db_interface import EdgeData
 
-logger = get_logger(level=ERROR)
+logger = get_logger()
 
 
-async def index_graph_edges():
+async def index_graph_edges(
+    edges_data: Union[List[EdgeData], List[Tuple[str, str, str, Optional[Dict[str, Any]]]]] = None,
+):
     """
     Indexes graph edges by creating and managing vector indexes for relationship types.
 
@@ -35,13 +40,17 @@
        index_points = {}
 
        vector_engine = get_vector_engine()
-       graph_engine = await get_graph_engine()
+
+       if edges_data is None:
+           graph_engine = await get_graph_engine()
+           _, edges_data = await graph_engine.get_graph_data()
+           logger.warning(
+               "Your graph edge embedding is deprecated, please pass edges to the index_graph_edges directly."
+           )
    except Exception as e:
        logger.error("Failed to initialize engines: %s", e)
        raise RuntimeError("Initialization error") from e
 
-   _, edges_data = await graph_engine.get_graph_data()
-
    edge_types = Counter(
        item.get("relationship_name")
        for edge in edges_data
@@ -69,15 +78,20 @@
            indexed_data_point.metadata["index_fields"] = [field_name]
            index_points[index_name].append(indexed_data_point)
 
+   # Get maximum batch size for embedding model
+   batch_size = vector_engine.embedding_engine.get_batch_size()
+   tasks: list[asyncio.Task] = []
+
    for index_name, indexable_points in index_points.items():
        index_name, field_name = index_name.split(".")
 
-       # Get maximum batch size for embedding model
-       batch_size = vector_engine.embedding_engine.get_batch_size()
-       # We save the data in batches of {batch_size} to not put a lot of pressure on the database
+       # Create embedding tasks to run in parallel later
       for start in range(0, len(indexable_points), batch_size):
           batch = indexable_points[start : start + batch_size]
 
-          await vector_engine.index_data_points(index_name, field_name, batch)
+          tasks.append(vector_engine.index_data_points(index_name, field_name, batch))
+
+   # Start all embedding tasks and wait for completion
+   await asyncio.gather(*tasks)
 
    return None
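Note, not part of the diff: index_graph_edges now takes the edges to embed as an argument; calling it with no argument still works, but it falls back to reading the whole graph and logs a deprecation warning. A sketch of the preferred calling pattern, mirroring the add_data_points and integrate_chunk_graphs hunks above (edges is whatever list was just passed to graph_engine.add_edges):

    from cognee.infrastructure.databases.graph import get_graph_engine
    from cognee.tasks.storage import index_graph_edges


    async def store_and_index_edges(edges):
        graph_engine = await get_graph_engine()

        if len(edges) > 0:
            await graph_engine.add_edges(edges)
            # New style: embed only the edges that were just written.
            await index_graph_edges(edges)

        # Deprecated style (still accepted): re-reads the full graph and warns.
        # await index_graph_edges()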
@@ -0,0 +1,34 @@
+"""Web scraping module for cognee.
+
+This module provides tools for scraping web content, managing scraping jobs, and storing
+data in a graph database. It includes classes and functions for crawling web pages using
+BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
+"""
+
+from .utils import fetch_page_content
+from .default_url_crawler import DefaultUrlCrawler
+
+# Lazy import for web_scraper_task to avoid requiring apscheduler
+# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ...
+
+
+def __getattr__(name):
+    """Lazy load web scraper task functions that require apscheduler."""
+    if name == "cron_web_scraper_task":
+        from .web_scraper_task import cron_web_scraper_task
+
+        return cron_web_scraper_task
+    elif name == "web_scraper_task":
+        from .web_scraper_task import web_scraper_task
+
+        return web_scraper_task
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "BeautifulSoupCrawler",
+    "fetch_page_content",
+    "cron_web_scraper_task",
+    "web_scraper_task",
+    "DefaultUrlCrawler",
+]
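Note, not part of the diff: the module-level __getattr__ above keeps apscheduler optional. DefaultUrlCrawler and fetch_page_content are imported eagerly, while the two task functions are only resolved when first accessed. A small illustration of how that lazy resolution behaves from the caller's side (hypothetical usage; the task signatures live in web_scraper_task.py and are not shown here):

    import cognee.tasks.web_scraper as web_scraper

    # Resolved eagerly at import time.
    crawler_cls = web_scraper.DefaultUrlCrawler

    # Resolved lazily through __getattr__; apscheduler is only needed once this line runs.
    scrape = web_scraper.web_scraper_task

    # Unknown names still raise AttributeError, as for a regular module.
    try:
        web_scraper.not_a_real_symbol
    except AttributeError as error:
        print(error)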
@@ -0,0 +1,26 @@
+from pydantic import BaseModel, Field
+from typing import Any, Dict, Optional, Literal
+import os
+
+
+class TavilyConfig(BaseModel):
+    api_key: Optional[str] = os.getenv("TAVILY_API_KEY")
+    extract_depth: Literal["basic", "advanced"] = "basic"
+    proxies: Optional[Dict[str, str]] = None
+    timeout: Optional[int] = Field(default=10, ge=1, le=60)
+
+
+class DefaultCrawlerConfig(BaseModel):
+    concurrency: int = 5
+    crawl_delay: float = 0.5
+    max_crawl_delay: Optional[float] = (
+        10.0  # Maximum crawl delay to respect from robots.txt (None = no limit)
+    )
+    timeout: float = 15.0
+    max_retries: int = 2
+    retry_delay_factor: float = 0.5
+    headers: Optional[Dict[str, str]] = None
+    use_playwright: bool = False
+    playwright_js_wait: float = 0.8
+    robots_cache_ttl: float = 3600.0
+    join_all_matches: bool = False
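Note, not part of the diff: these two pydantic models hold the scraper settings. A quick construction sketch using only the fields defined above (how DefaultUrlCrawler and the Tavily path consume these configs is not shown in this diff):

    from cognee.tasks.web_scraper.config import DefaultCrawlerConfig, TavilyConfig

    # Override individual fields; anything left unset keeps the defaults above.
    crawler_config = DefaultCrawlerConfig(
        concurrency=10,
        crawl_delay=1.0,
        use_playwright=True,  # render JavaScript-heavy pages before extraction
    )

    # TavilyConfig reads TAVILY_API_KEY from the environment by default;
    # timeout is validated to the 1-60 second range.
    tavily_config = TavilyConfig(extract_depth="advanced", timeout=30)

    print(crawler_config.model_dump())
    print(tavily_config.extract_depth)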