cognee 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +5 -1
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/datasets/datasets.py +11 -0
  7. cognee/api/v1/responses/default_tools.py +0 -1
  8. cognee/api/v1/responses/dispatch_function.py +1 -1
  9. cognee/api/v1/responses/routers/default_tools.py +0 -1
  10. cognee/api/v1/search/search.py +11 -9
  11. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  12. cognee/api/v1/ui/ui.py +47 -16
  13. cognee/api/v1/update/routers/get_update_router.py +1 -1
  14. cognee/api/v1/update/update.py +3 -3
  15. cognee/cli/_cognee.py +61 -10
  16. cognee/cli/commands/add_command.py +3 -3
  17. cognee/cli/commands/cognify_command.py +3 -3
  18. cognee/cli/commands/config_command.py +9 -7
  19. cognee/cli/commands/delete_command.py +3 -3
  20. cognee/cli/commands/search_command.py +3 -7
  21. cognee/cli/config.py +0 -1
  22. cognee/context_global_variables.py +5 -0
  23. cognee/exceptions/exceptions.py +1 -1
  24. cognee/infrastructure/databases/cache/__init__.py +2 -0
  25. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  26. cognee/infrastructure/databases/cache/config.py +44 -0
  27. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  28. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  29. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  30. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  31. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  32. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  33. cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
  34. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  35. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  36. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  37. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  38. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  39. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  40. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  41. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  42. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  43. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  44. cognee/infrastructure/files/exceptions.py +1 -1
  45. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  46. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  47. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  48. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  51. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  52. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  53. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  54. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  55. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  56. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  57. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  58. cognee/infrastructure/loaders/external/__init__.py +7 -0
  59. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  60. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  61. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  62. cognee/modules/data/exceptions/exceptions.py +1 -1
  63. cognee/modules/data/methods/__init__.py +3 -0
  64. cognee/modules/data/methods/get_dataset_data.py +4 -1
  65. cognee/modules/data/methods/has_dataset_data.py +21 -0
  66. cognee/modules/engine/models/TableRow.py +0 -1
  67. cognee/modules/ingestion/save_data_to_file.py +9 -2
  68. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  69. cognee/modules/pipelines/operations/pipeline.py +12 -1
  70. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  71. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  72. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  73. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  74. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  75. cognee/modules/retrieval/base_retriever.py +3 -1
  76. cognee/modules/retrieval/chunks_retriever.py +5 -1
  77. cognee/modules/retrieval/code_retriever.py +20 -2
  78. cognee/modules/retrieval/completion_retriever.py +50 -9
  79. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  80. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  81. cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
  82. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  83. cognee/modules/retrieval/lexical_retriever.py +20 -2
  84. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  85. cognee/modules/retrieval/summaries_retriever.py +5 -1
  86. cognee/modules/retrieval/temporal_retriever.py +62 -10
  87. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  88. cognee/modules/retrieval/utils/completion.py +5 -0
  89. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  90. cognee/modules/retrieval/utils/session_cache.py +156 -0
  91. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  92. cognee/modules/search/methods/no_access_control_search.py +12 -1
  93. cognee/modules/search/methods/search.py +34 -2
  94. cognee/modules/search/types/SearchType.py +0 -1
  95. cognee/modules/settings/get_settings.py +23 -0
  96. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  97. cognee/modules/users/methods/get_default_user.py +1 -6
  98. cognee/modules/users/roles/methods/create_role.py +2 -2
  99. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  100. cognee/shared/exceptions/exceptions.py +1 -1
  101. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  102. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  103. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  104. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  105. cognee/tasks/ingestion/ingest_data.py +11 -5
  106. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  107. cognee/tasks/storage/add_data_points.py +3 -10
  108. cognee/tasks/storage/index_data_points.py +19 -14
  109. cognee/tasks/storage/index_graph_edges.py +25 -11
  110. cognee/tasks/web_scraper/__init__.py +34 -0
  111. cognee/tasks/web_scraper/config.py +26 -0
  112. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  113. cognee/tasks/web_scraper/models.py +46 -0
  114. cognee/tasks/web_scraper/types.py +4 -0
  115. cognee/tasks/web_scraper/utils.py +142 -0
  116. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  117. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  118. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  119. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  120. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  121. cognee/tests/subprocesses/reader.py +25 -0
  122. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  123. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  124. cognee/tests/subprocesses/writer.py +32 -0
  125. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  126. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  127. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  128. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  129. cognee/tests/test_add_docling_document.py +56 -0
  130. cognee/tests/test_chromadb.py +7 -11
  131. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  132. cognee/tests/test_conversation_history.py +240 -0
  133. cognee/tests/test_kuzu.py +27 -15
  134. cognee/tests/test_lancedb.py +7 -11
  135. cognee/tests/test_library.py +32 -2
  136. cognee/tests/test_neo4j.py +24 -16
  137. cognee/tests/test_neptune_analytics_vector.py +7 -11
  138. cognee/tests/test_permissions.py +9 -13
  139. cognee/tests/test_pgvector.py +4 -4
  140. cognee/tests/test_remote_kuzu.py +8 -11
  141. cognee/tests/test_s3_file_storage.py +1 -1
  142. cognee/tests/test_search_db.py +6 -8
  143. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  144. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  145. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/METADATA +21 -6
  146. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -126
  147. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
  148. distributed/Dockerfile +0 -3
  149. distributed/entrypoint.py +21 -9
  150. distributed/signal.py +5 -0
  151. distributed/workers/data_point_saving_worker.py +64 -34
  152. distributed/workers/graph_saving_worker.py +71 -47
  153. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  154. cognee/modules/retrieval/insights_retriever.py +0 -133
  155. cognee/tests/test_memgraph.py +0 -109
  156. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  157. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
  158. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
  159. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
@@ -5,6 +5,8 @@ from uuid import UUID
 from fastapi.encoders import jsonable_encoder
 from typing import Any, List, Optional, Tuple, Type, Union
 
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.shared.logging_utils import get_logger
 from cognee.shared.utils import send_telemetry
 from cognee.context_global_variables import set_database_global_context_variables
 
@@ -27,6 +29,8 @@ from .get_search_type_tools import get_search_type_tools
 from .no_access_control_search import no_access_control_search
 from ..utils.prepare_search_result import prepare_search_result
 
+logger = get_logger()
+
 
 async def search(
     query_text: str,
@@ -42,6 +46,7 @@ async def search(
     last_k: Optional[int] = None,
     only_context: bool = False,
     use_combined_context: bool = False,
+    session_id: Optional[str] = None,
 ) -> Union[CombinedSearchResult, List[SearchResult]]:
     """
 
@@ -77,6 +82,7 @@ async def search(
             last_k=last_k,
             only_context=only_context,
             use_combined_context=use_combined_context,
+            session_id=session_id,
         )
     else:
         search_results = [
@@ -91,6 +97,7 @@ async def search(
                 save_interaction=save_interaction,
                 last_k=last_k,
                 only_context=only_context,
+                session_id=session_id,
             )
         ]
 
@@ -195,6 +202,7 @@ async def authorized_search(
     last_k: Optional[int] = None,
     only_context: bool = False,
     use_combined_context: bool = False,
+    session_id: Optional[str] = None,
 ) -> Union[
     Tuple[Any, Union[List[Edge], str], List[Dataset]],
     List[Tuple[Any, Union[List[Edge], str], List[Dataset]]],
@@ -221,6 +229,7 @@ async def authorized_search(
            save_interaction=save_interaction,
            last_k=last_k,
            only_context=True,
+           session_id=session_id,
        )
 
        context = {}
@@ -263,7 +272,7 @@ async def authorized_search(
            return combined_context
 
        combined_context = prepare_combined_context(context)
-       completion = await get_completion(query_text, combined_context)
+       completion = await get_completion(query_text, combined_context, session_id=session_id)
 
        return completion, combined_context, datasets
 
@@ -280,6 +289,7 @@ async def authorized_search(
            save_interaction=save_interaction,
            last_k=last_k,
            only_context=only_context,
+           session_id=session_id,
        )
 
        return search_results
@@ -298,6 +308,7 @@ async def search_in_datasets_context(
     last_k: Optional[int] = None,
     only_context: bool = False,
     context: Optional[Any] = None,
+    session_id: Optional[str] = None,
 ) -> List[Tuple[Any, Union[str, List[Edge]], List[Dataset]]]:
     """
     Searches all provided datasets and handles setting up of appropriate database context based on permissions.
@@ -317,10 +328,30 @@ async def search_in_datasets_context(
        last_k: Optional[int] = None,
        only_context: bool = False,
        context: Optional[Any] = None,
+       session_id: Optional[str] = None,
    ) -> Tuple[Any, Union[str, List[Edge]], List[Dataset]]:
        # Set database configuration in async context for each dataset user has access for
        await set_database_global_context_variables(dataset.id, dataset.owner_id)
 
+       graph_engine = await get_graph_engine()
+       is_empty = await graph_engine.is_empty()
+
+       if is_empty:
+           # TODO: we can log here, but not all search types use graph. Still keeping this here for reviewer input
+           from cognee.modules.data.methods import get_dataset_data
+
+           dataset_data = await get_dataset_data(dataset.id)
+
+           if len(dataset_data) > 0:
+               logger.warning(
+                   f"Dataset '{dataset.name}' has {len(dataset_data)} data item(s) but the knowledge graph is empty. "
+                   "Please run cognify to process the data before searching."
+               )
+           else:
+               logger.warning(
+                   "Search attempt on an empty knowledge graph - no data has been added to this dataset"
+               )
+
        specific_search_tools = await get_search_type_tools(
            query_type=query_type,
            query_text=query_text,
@@ -340,7 +371,7 @@ async def search_in_datasets_context(
                return None, await get_context(query_text), [dataset]
 
            search_context = context or await get_context(query_text)
-           search_result = await get_completion(query_text, search_context)
+           search_result = await get_completion(query_text, search_context, session_id=session_id)
 
            return search_result, search_context, [dataset]
        else:
@@ -365,6 +396,7 @@ async def search_in_datasets_context(
                last_k=last_k,
                only_context=only_context,
                context=context,
+               session_id=session_id,
            )
        )
 
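Note, not part of the diff: the hunks above thread an optional session_id from search() through authorized_search() and search_in_datasets_context() down to get_completion(). An illustrative caller-side sketch, assuming the public cognee.search wrapper (touched in cognee/api/v1/search/search.py above) forwards the new parameter:

    import asyncio

    import cognee
    from cognee.modules.search.types import SearchType


    async def main():
        session_id = "demo-session"  # reuse the same id to keep one conversation session

        first = await cognee.search(
            query_text="Who founded the company?",
            query_type=SearchType.GRAPH_COMPLETION,
            session_id=session_id,
        )
        # A follow-up in the same session can rely on the stored interaction history.
        follow_up = await cognee.search(
            query_text="And in which year?",
            query_type=SearchType.GRAPH_COMPLETION,
            session_id=session_id,
        )
        print(first, follow_up)


    asyncio.run(main())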
@@ -3,7 +3,6 @@ from enum import Enum
 
 class SearchType(Enum):
     SUMMARIES = "SUMMARIES"
-    INSIGHTS = "INSIGHTS"
     CHUNKS = "CHUNKS"
     RAG_COMPLETION = "RAG_COMPLETION"
     GRAPH_COMPLETION = "GRAPH_COMPLETION"
@@ -15,6 +15,7 @@ class ModelName(Enum):
     ollama = "ollama"
     anthropic = "anthropic"
     gemini = "gemini"
+    mistral = "mistral"
 
 
 class LLMConfig(BaseModel):
@@ -72,6 +73,10 @@ def get_settings() -> SettingsDict:
            "value": "gemini",
            "label": "Gemini",
        },
+       {
+           "value": "mistral",
+           "label": "Mistral",
+       },
    ]
 
    return SettingsDict.model_validate(
@@ -134,6 +139,24 @@ def get_settings() -> SettingsDict:
                    "label": "Gemini 2.0 Flash",
                },
            ],
+           "mistral": [
+               {
+                   "value": "mistral-medium-2508",
+                   "label": "Mistral Medium 3.1",
+               },
+               {
+                   "value": "magistral-medium-2509",
+                   "label": "Magistral Medium 1.2",
+               },
+               {
+                   "value": "magistral-medium-2507",
+                   "label": "Magistral Medium 1.1",
+               },
+               {
+                   "value": "mistral-large-2411",
+                   "label": "Mistral Large 2.1",
+               },
+           ],
        },
    },
    vector_db={
@@ -37,6 +37,8 @@ async def get_authenticated_user(
     except Exception as e:
         # Convert any get_default_user failure into a proper HTTP 500 error
         logger.error(f"Failed to create default user: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Failed to create default user: {str(e)}")
+        raise HTTPException(
+            status_code=500, detail=f"Failed to create default user: {str(e)}"
+        ) from e
 
     return user
@@ -27,12 +27,7 @@ async def get_default_user() -> SimpleNamespace:
         if user is None:
             return await create_default_user()
 
-        # We return a SimpleNamespace to have the same user type as our SaaS
-        # SimpleNamespace is just a dictionary which can be accessed through attributes
-        auth_data = SimpleNamespace(
-            id=user.id, email=user.email, tenant_id=user.tenant_id, roles=[]
-        )
-        return auth_data
+        return user
     except Exception as error:
         if "principals" in str(error.args):
             raise DatabaseNotCreatedError() from error
@@ -40,8 +40,8 @@ async def create_role(
         # Add association directly to the association table
         role = Role(name=role_name, tenant_id=tenant.id)
         session.add(role)
-    except IntegrityError:
-        raise EntityAlreadyExistsError(message="Role already exists for tenant.")
+    except IntegrityError as e:
+        raise EntityAlreadyExistsError(message="Role already exists for tenant.") from e
 
     await session.commit()
     await session.refresh(role)
@@ -35,5 +35,5 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID:
         await session.merge(user)
         await session.commit()
         return tenant.id
-    except IntegrityError:
-        raise EntityAlreadyExistsError(message="Tenant already exists.")
+    except IntegrityError as e:
+        raise EntityAlreadyExistsError(message="Tenant already exists.") from e
@@ -7,6 +7,6 @@ class IngestionError(CogneeValidationError):
         self,
         message: str = "Failed to load data.",
         name: str = "IngestionError",
-        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         super().__init__(message, name, status_code)
@@ -124,5 +124,4 @@ async def add_rule_associations(
 
     if len(edges_to_save) > 0:
         await graph_engine.add_edges(edges_to_save)
-
-    await index_graph_edges()
+        await index_graph_edges(edges_to_save)
@@ -12,7 +12,7 @@ class WrongDataDocumentInputError(CogneeValidationError):
         self,
         field: str,
         name: str = "WrongDataDocumentInputError",
-        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         message = f"Missing of invalid parameter: '{field}'."
         super().__init__(message, name, status_code)
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
+from cognee.tasks.storage import index_graph_edges
 from cognee.tasks.storage.add_data_points import add_data_points
 from cognee.modules.ontology.ontology_config import Config
 from cognee.modules.ontology.get_default_ontology_resolver import (
@@ -88,6 +89,7 @@ async def integrate_chunk_graphs(
 
     if len(graph_edges) > 0:
         await graph_engine.add_edges(graph_edges)
+        await index_graph_edges(graph_edges)
 
     return data_chunks
 
@@ -1,6 +1,6 @@
 import os
 from urllib.parse import urlparse
-from typing import List, Tuple
+from typing import Any, List, Tuple
 from pathlib import Path
 import tempfile
 
@@ -34,7 +34,8 @@ async def pull_from_s3(file_path, destination_file) -> None:
 
 
 async def data_item_to_text_file(
-    data_item_path: str, preferred_loaders: List[str]
+    data_item_path: str,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
 ) -> Tuple[str, LoaderInterface]:
     if isinstance(data_item_path, str):
         parsed_url = urlparse(data_item_path)
@@ -74,6 +75,5 @@ async def data_item_to_text_file(
         )
     else:
         raise IngestionError(message="Local files are not accepted.")
-
     # data is not a supported type
     raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@@ -27,7 +28,7 @@ async def ingest_data(
     user: User,
     node_set: Optional[List[str]] = None,
     dataset_id: UUID = None,
-    preferred_loaders: List[str] = None,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
 ):
     if not user:
         user = await get_default_user()
@@ -44,7 +45,7 @@ async def ingest_data(
        user: User,
        node_set: Optional[List[str]] = None,
        dataset_id: UUID = None,
-       preferred_loaders: List[str] = None,
+       preferred_loaders: dict[str, dict[str, Any]] = None,
    ):
        new_datapoints = []
        existing_data_points = []
@@ -77,22 +78,27 @@ async def ingest_data(
        dataset_data_map = {str(data.id): True for data in dataset_data}
 
        for data_item in data:
-           # Get file path of data item or create a file it doesn't exist
+           # Get file path of data item or create a file if it doesn't exist
            original_file_path = await save_data_item_to_storage(data_item)
-
            # Transform file path to be OS usable
            actual_file_path = get_data_file_path(original_file_path)
 
            # Store all input data as text files in Cognee data storage
            cognee_storage_file_path, loader_engine = await data_item_to_text_file(
-               actual_file_path, preferred_loaders
+               actual_file_path,
+               preferred_loaders,
            )
 
+           if loader_engine is None:
+               raise IngestionError("Loader cannot be None")
+
            # Find metadata from original file
+           # Standard flow: extract metadata from both original and stored files
            async with open_data_file(original_file_path) as file:
                classified_data = ingestion.classify(file)
 
            # data_id is the hash of original file contents + owner id to avoid duplicate data
+
            data_id = ingestion.identify(classified_data, user)
            original_file_metadata = classified_data.get_metadata()
 
@@ -8,6 +8,9 @@ from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
+from cognee.tasks.web_scraper.utils import fetch_page_content
+
+
 logger = get_logger()
 
 
@@ -27,6 +30,12 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
 
         return await get_data_from_llama_index(data_item)
 
+    if "docling" in str(type(data_item)):
+        from docling_core.types import DoclingDocument
+
+        if isinstance(data_item, DoclingDocument):
+            data_item = data_item.export_to_text()
+
     # data is a file object coming from upload.
     if hasattr(data_item, "file"):
         return await save_data_to_file(data_item.file, filename=data_item.filename)
@@ -48,7 +57,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
         # data is s3 file path
         if parsed_url.scheme == "s3":
             return data_item
-
+        elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
+            urls_to_page_contents = await fetch_page_content(data_item)
+            return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
         # data is local file path
         elif parsed_url.scheme == "file":
             if settings.accept_local_file_path:
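Note, not part of the diff: with the new http/https branch, a plain URL handed to the ingestion flow is fetched with fetch_page_content and persisted as an .html file before loading. A minimal sketch of that branch in isolation, assuming fetch_page_content returns a mapping of URL to page content, as the urls_to_page_contents[data_item] lookup implies:

    import asyncio

    from cognee.modules.ingestion import save_data_to_file
    from cognee.tasks.web_scraper.utils import fetch_page_content


    async def save_url_as_html(url: str) -> str:
        # fetch_page_content returns {url: page_content}; index by the same URL.
        urls_to_page_contents = await fetch_page_content(url)
        # Persist the fetched HTML into Cognee's data storage and return the stored path.
        return await save_data_to_file(urls_to_page_contents[url], file_extension="html")


    print(asyncio.run(save_url_as_html("https://example.com")))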
@@ -10,9 +10,7 @@ from cognee.tasks.storage.exceptions import (
 )
 
 
-async def add_data_points(
-    data_points: List[DataPoint], update_edge_collection: bool = True
-) -> List[DataPoint]:
+async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
     """
     Add a batch of data points to the graph database by extracting nodes and edges,
     deduplicating them, and indexing them for retrieval.
@@ -25,9 +23,6 @@ async def add_data_points(
     Args:
         data_points (List[DataPoint]):
             A list of data points to process and insert into the graph.
-        update_edge_collection (bool, optional):
-            Whether to update the edge index after adding edges.
-            Defaults to True.
 
     Returns:
         List[DataPoint]:
@@ -73,12 +68,10 @@ async def add_data_points(
 
     graph_engine = await get_graph_engine()
 
+    await graph_engine.add_nodes(nodes)
     await index_data_points(nodes)
 
-    await graph_engine.add_nodes(nodes)
     await graph_engine.add_edges(edges)
-
-    if update_edge_collection:
-        await index_graph_edges()
+    await index_graph_edges(edges)
 
     return data_points
@@ -1,6 +1,6 @@
-from cognee.shared.logging_utils import get_logger
+import asyncio
 
-from cognee.infrastructure.databases.exceptions import EmbeddingException
+from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 
@@ -33,18 +33,23 @@ async def index_data_points(data_points: list[DataPoint]):
             indexed_data_point.metadata["index_fields"] = [field_name]
             index_points[index_name].append(indexed_data_point)
 
-    for index_name_and_field, indexable_points in index_points.items():
-        first_occurence = index_name_and_field.index("_")
-        index_name = index_name_and_field[:first_occurence]
-        field_name = index_name_and_field[first_occurence + 1 :]
-        try:
-            # In case the amount of indexable points is too large we need to send them in batches
-            batch_size = vector_engine.embedding_engine.get_batch_size()
-            for i in range(0, len(indexable_points), batch_size):
-                batch = indexable_points[i : i + batch_size]
-                await vector_engine.index_data_points(index_name, field_name, batch)
-        except EmbeddingException as e:
-            logger.warning(f"Failed to index data points for {index_name}.{field_name}: {e}")
+    tasks: list[asyncio.Task] = []
+    batch_size = vector_engine.embedding_engine.get_batch_size()
+
+    for index_name_and_field, points in index_points.items():
+        first = index_name_and_field.index("_")
+        index_name = index_name_and_field[:first]
+        field_name = index_name_and_field[first + 1 :]
+
+        # Create embedding requests per batch to run in parallel later
+        for i in range(0, len(points), batch_size):
+            batch = points[i : i + batch_size]
+            tasks.append(
+                asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch))
+            )
+
+    # Run all embedding requests in parallel
+    await asyncio.gather(*tasks)
 
     return data_points
 
@@ -1,15 +1,20 @@
+import asyncio
+
 from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
-from cognee.shared.logging_utils import get_logger, ERROR
+from cognee.shared.logging_utils import get_logger
 from collections import Counter
-
+from typing import Optional, Dict, Any, List, Tuple, Union
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.models.EdgeType import EdgeType
+from cognee.infrastructure.databases.graph.graph_db_interface import EdgeData
 
-logger = get_logger(level=ERROR)
+logger = get_logger()
 
 
-async def index_graph_edges():
+async def index_graph_edges(
+    edges_data: Union[List[EdgeData], List[Tuple[str, str, str, Optional[Dict[str, Any]]]]] = None,
+):
     """
     Indexes graph edges by creating and managing vector indexes for relationship types.
 
@@ -35,13 +40,17 @@
        index_points = {}
 
        vector_engine = get_vector_engine()
-       graph_engine = await get_graph_engine()
+
+       if edges_data is None:
+           graph_engine = await get_graph_engine()
+           _, edges_data = await graph_engine.get_graph_data()
+           logger.warning(
+               "Your graph edge embedding is deprecated, please pass edges to the index_graph_edges directly."
+           )
    except Exception as e:
        logger.error("Failed to initialize engines: %s", e)
        raise RuntimeError("Initialization error") from e
 
-   _, edges_data = await graph_engine.get_graph_data()
-
    edge_types = Counter(
        item.get("relationship_name")
        for edge in edges_data
@@ -69,15 +78,20 @@
            indexed_data_point.metadata["index_fields"] = [field_name]
            index_points[index_name].append(indexed_data_point)
 
+   # Get maximum batch size for embedding model
+   batch_size = vector_engine.embedding_engine.get_batch_size()
+   tasks: list[asyncio.Task] = []
+
    for index_name, indexable_points in index_points.items():
        index_name, field_name = index_name.split(".")
 
-       # Get maximum batch size for embedding model
-       batch_size = vector_engine.embedding_engine.get_batch_size()
-       # We save the data in batches of {batch_size} to not put a lot of pressure on the database
+       # Create embedding tasks to run in parallel later
       for start in range(0, len(indexable_points), batch_size):
           batch = indexable_points[start : start + batch_size]
 
-          await vector_engine.index_data_points(index_name, field_name, batch)
+          tasks.append(vector_engine.index_data_points(index_name, field_name, batch))
+
+   # Start all embedding tasks and wait for completion
+   await asyncio.gather(*tasks)
 
    return None
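Note, not part of the diff: index_graph_edges now takes the edges to embed as an argument; calling it with no argument still works, but it falls back to reading the whole graph and logs a deprecation warning. A sketch of the preferred calling pattern, mirroring the add_data_points and integrate_chunk_graphs hunks above (edges is whatever list was just passed to graph_engine.add_edges):

    from cognee.infrastructure.databases.graph import get_graph_engine
    from cognee.tasks.storage import index_graph_edges


    async def store_and_index_edges(edges):
        graph_engine = await get_graph_engine()

        if len(edges) > 0:
            await graph_engine.add_edges(edges)
            # New style: embed only the edges that were just written.
            await index_graph_edges(edges)

        # Deprecated style (still accepted): re-reads the full graph and warns.
        # await index_graph_edges()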
@@ -0,0 +1,34 @@
+"""Web scraping module for cognee.
+
+This module provides tools for scraping web content, managing scraping jobs, and storing
+data in a graph database. It includes classes and functions for crawling web pages using
+BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
+"""
+
+from .utils import fetch_page_content
+from .default_url_crawler import DefaultUrlCrawler
+
+# Lazy import for web_scraper_task to avoid requiring apscheduler
+# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ...
+
+
+def __getattr__(name):
+    """Lazy load web scraper task functions that require apscheduler."""
+    if name == "cron_web_scraper_task":
+        from .web_scraper_task import cron_web_scraper_task
+
+        return cron_web_scraper_task
+    elif name == "web_scraper_task":
+        from .web_scraper_task import web_scraper_task
+
+        return web_scraper_task
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "BeautifulSoupCrawler",
+    "fetch_page_content",
+    "cron_web_scraper_task",
+    "web_scraper_task",
+    "DefaultUrlCrawler",
+]
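Note, not part of the diff: the module-level __getattr__ above keeps apscheduler optional. DefaultUrlCrawler and fetch_page_content are imported eagerly, while the two task functions are only resolved when first accessed. A small illustration of how that lazy resolution behaves from the caller's side (hypothetical usage; the task signatures live in web_scraper_task.py and are not shown here):

    import cognee.tasks.web_scraper as web_scraper

    # Resolved eagerly at import time.
    crawler_cls = web_scraper.DefaultUrlCrawler

    # Resolved lazily through __getattr__; apscheduler is only needed once this line runs.
    scrape = web_scraper.web_scraper_task

    # Unknown names still raise AttributeError, as for a regular module.
    try:
        web_scraper.not_a_real_symbol
    except AttributeError as error:
        print(error)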
@@ -0,0 +1,26 @@
+from pydantic import BaseModel, Field
+from typing import Any, Dict, Optional, Literal
+import os
+
+
+class TavilyConfig(BaseModel):
+    api_key: Optional[str] = os.getenv("TAVILY_API_KEY")
+    extract_depth: Literal["basic", "advanced"] = "basic"
+    proxies: Optional[Dict[str, str]] = None
+    timeout: Optional[int] = Field(default=10, ge=1, le=60)
+
+
+class DefaultCrawlerConfig(BaseModel):
+    concurrency: int = 5
+    crawl_delay: float = 0.5
+    max_crawl_delay: Optional[float] = (
+        10.0  # Maximum crawl delay to respect from robots.txt (None = no limit)
+    )
+    timeout: float = 15.0
+    max_retries: int = 2
+    retry_delay_factor: float = 0.5
+    headers: Optional[Dict[str, str]] = None
+    use_playwright: bool = False
+    playwright_js_wait: float = 0.8
+    robots_cache_ttl: float = 3600.0
+    join_all_matches: bool = False
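Note, not part of the diff: these two pydantic models hold the scraper settings. A quick construction sketch using only the fields defined above (how DefaultUrlCrawler and the Tavily path consume these configs is not shown in this diff):

    from cognee.tasks.web_scraper.config import DefaultCrawlerConfig, TavilyConfig

    # Override individual fields; anything left unset keeps the defaults above.
    crawler_config = DefaultCrawlerConfig(
        concurrency=10,
        crawl_delay=1.0,
        use_playwright=True,  # render JavaScript-heavy pages before extraction
    )

    # TavilyConfig reads TAVILY_API_KEY from the environment by default;
    # timeout is validated to the 1-60 second range.
    tavily_config = TavilyConfig(extract_depth="advanced", timeout=30)

    print(crawler_config.model_dump())
    print(tavily_config.extract_depth)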