cognee 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +5 -1
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/ui/ui.py +47 -16
- cognee/api/v1/update/routers/get_update_router.py +1 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +5 -0
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +34 -2
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/METADATA +21 -6
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -126
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
@@ -5,6 +5,8 @@ from uuid import UUID
 from fastapi.encoders import jsonable_encoder
 from typing import Any, List, Optional, Tuple, Type, Union
 
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.shared.logging_utils import get_logger
 from cognee.shared.utils import send_telemetry
 from cognee.context_global_variables import set_database_global_context_variables
 
@@ -27,6 +29,8 @@ from .get_search_type_tools import get_search_type_tools
 from .no_access_control_search import no_access_control_search
 from ..utils.prepare_search_result import prepare_search_result
 
+logger = get_logger()
+
 
 async def search(
     query_text: str,
@@ -42,6 +46,7 @@ async def search(
     last_k: Optional[int] = None,
     only_context: bool = False,
     use_combined_context: bool = False,
+    session_id: Optional[str] = None,
 ) -> Union[CombinedSearchResult, List[SearchResult]]:
     """
 
@@ -77,6 +82,7 @@ async def search(
             last_k=last_k,
             only_context=only_context,
             use_combined_context=use_combined_context,
+            session_id=session_id,
         )
     else:
         search_results = [
@@ -91,6 +97,7 @@ async def search(
                 save_interaction=save_interaction,
                 last_k=last_k,
                 only_context=only_context,
+                session_id=session_id,
             )
         ]
 
@@ -195,6 +202,7 @@ async def authorized_search(
     last_k: Optional[int] = None,
     only_context: bool = False,
     use_combined_context: bool = False,
+    session_id: Optional[str] = None,
 ) -> Union[
     Tuple[Any, Union[List[Edge], str], List[Dataset]],
     List[Tuple[Any, Union[List[Edge], str], List[Dataset]]],
@@ -221,6 +229,7 @@ async def authorized_search(
             save_interaction=save_interaction,
             last_k=last_k,
             only_context=True,
+            session_id=session_id,
         )
 
         context = {}
@@ -263,7 +272,7 @@ async def authorized_search(
             return combined_context
 
         combined_context = prepare_combined_context(context)
-        completion = await get_completion(query_text, combined_context)
+        completion = await get_completion(query_text, combined_context, session_id=session_id)
 
         return completion, combined_context, datasets
 
@@ -280,6 +289,7 @@ async def authorized_search(
             save_interaction=save_interaction,
             last_k=last_k,
             only_context=only_context,
+            session_id=session_id,
         )
 
         return search_results
@@ -298,6 +308,7 @@ async def search_in_datasets_context(
     last_k: Optional[int] = None,
     only_context: bool = False,
     context: Optional[Any] = None,
+    session_id: Optional[str] = None,
 ) -> List[Tuple[Any, Union[str, List[Edge]], List[Dataset]]]:
     """
     Searches all provided datasets and handles setting up of appropriate database context based on permissions.
@@ -317,10 +328,30 @@ async def search_in_datasets_context(
         last_k: Optional[int] = None,
         only_context: bool = False,
         context: Optional[Any] = None,
+        session_id: Optional[str] = None,
     ) -> Tuple[Any, Union[str, List[Edge]], List[Dataset]]:
         # Set database configuration in async context for each dataset user has access for
         await set_database_global_context_variables(dataset.id, dataset.owner_id)
 
+        graph_engine = await get_graph_engine()
+        is_empty = await graph_engine.is_empty()
+
+        if is_empty:
+            # TODO: we can log here, but not all search types use graph. Still keeping this here for reviewer input
+            from cognee.modules.data.methods import get_dataset_data
+
+            dataset_data = await get_dataset_data(dataset.id)
+
+            if len(dataset_data) > 0:
+                logger.warning(
+                    f"Dataset '{dataset.name}' has {len(dataset_data)} data item(s) but the knowledge graph is empty. "
+                    "Please run cognify to process the data before searching."
+                )
+            else:
+                logger.warning(
+                    "Search attempt on an empty knowledge graph - no data has been added to this dataset"
+                )
+
         specific_search_tools = await get_search_type_tools(
             query_type=query_type,
             query_text=query_text,
@@ -340,7 +371,7 @@ async def search_in_datasets_context(
             return None, await get_context(query_text), [dataset]
 
         search_context = context or await get_context(query_text)
-        search_result = await get_completion(query_text, search_context)
+        search_result = await get_completion(query_text, search_context, session_id=session_id)
 
         return search_result, search_context, [dataset]
     else:
@@ -365,6 +396,7 @@ async def search_in_datasets_context(
             last_k=last_k,
             only_context=only_context,
             context=context,
+            session_id=session_id,
         )
     )
 
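The hunks above thread an optional session_id parameter through search(), authorized_search(), and search_in_datasets_context() and forward it to get_completion(), so completions can be tied to a conversation session. A minimal usage sketch, assuming the public cognee.search() entry point forwards session_id the same way (the query text and session name are illustrative):

    import asyncio

    import cognee
    from cognee import SearchType


    async def main():
        # Reusing the same session_id across calls lets completions share
        # conversation history for that session (assumption based on this diff).
        results = await cognee.search(
            query_text="What did we ingest about web scraping?",
            query_type=SearchType.GRAPH_COMPLETION,
            session_id="demo-session",
        )
        print(results)


    asyncio.run(main())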
@@ -15,6 +15,7 @@ class ModelName(Enum):
     ollama = "ollama"
     anthropic = "anthropic"
     gemini = "gemini"
+    mistral = "mistral"
 
 
 class LLMConfig(BaseModel):
@@ -72,6 +73,10 @@ def get_settings() -> SettingsDict:
             "value": "gemini",
             "label": "Gemini",
         },
+        {
+            "value": "mistral",
+            "label": "Mistral",
+        },
     ]
 
     return SettingsDict.model_validate(
@@ -134,6 +139,24 @@ def get_settings() -> SettingsDict:
                     "label": "Gemini 2.0 Flash",
                 },
             ],
+            "mistral": [
+                {
+                    "value": "mistral-medium-2508",
+                    "label": "Mistral Medium 3.1",
+                },
+                {
+                    "value": "magistral-medium-2509",
+                    "label": "Magistral Medium 1.2",
+                },
+                {
+                    "value": "magistral-medium-2507",
+                    "label": "Magistral Medium 1.1",
+                },
+                {
+                    "value": "mistral-large-2411",
+                    "label": "Mistral Large 2.1",
+                },
+            ],
         },
     },
     vector_db={
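These hunks register "mistral" as a selectable LLM provider and expose the Mistral and Magistral model identifiers through get_settings(). A hedged configuration sketch, assuming cognee's usual environment-variable LLM settings (LLM_PROVIDER, LLM_MODEL, and LLM_API_KEY are assumed key names; verify them against the cognee configuration docs):

    import os

    # Assumed configuration keys; the model value comes from the settings list above.
    os.environ["LLM_PROVIDER"] = "mistral"
    os.environ["LLM_MODEL"] = "mistral-medium-2508"
    os.environ["LLM_API_KEY"] = "<your-mistral-api-key>"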
@@ -37,6 +37,8 @@ async def get_authenticated_user(
     except Exception as e:
         # Convert any get_default_user failure into a proper HTTP 500 error
         logger.error(f"Failed to create default user: {str(e)}")
-        raise HTTPException(
+        raise HTTPException(
+            status_code=500, detail=f"Failed to create default user: {str(e)}"
+        ) from e
 
     return user
@@ -27,12 +27,7 @@ async def get_default_user() -> SimpleNamespace:
         if user is None:
             return await create_default_user()
 
-
-        # SimpleNamespace is just a dictionary which can be accessed through attributes
-        auth_data = SimpleNamespace(
-            id=user.id, email=user.email, tenant_id=user.tenant_id, roles=[]
-        )
-        return auth_data
+        return user
     except Exception as error:
         if "principals" in str(error.args):
             raise DatabaseNotCreatedError() from error
@@ -40,8 +40,8 @@ async def create_role(
             # Add association directly to the association table
             role = Role(name=role_name, tenant_id=tenant.id)
             session.add(role)
-        except IntegrityError:
-            raise EntityAlreadyExistsError(message="Role already exists for tenant.")
+        except IntegrityError as e:
+            raise EntityAlreadyExistsError(message="Role already exists for tenant.") from e
 
         await session.commit()
         await session.refresh(role)
@@ -35,5 +35,5 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID:
             await session.merge(user)
             await session.commit()
             return tenant.id
-        except IntegrityError:
-            raise EntityAlreadyExistsError(message="Tenant already exists.")
+        except IntegrityError as e:
+            raise EntityAlreadyExistsError(message="Tenant already exists.") from e
@@ -7,6 +7,6 @@ class IngestionError(CogneeValidationError):
         self,
         message: str = "Failed to load data.",
         name: str = "IngestionError",
-        status_code=status.
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         super().__init__(message, name, status_code)
@@ -12,7 +12,7 @@ class WrongDataDocumentInputError(CogneeValidationError):
         self,
         field: str,
         name: str = "WrongDataDocumentInputError",
-        status_code: int = status.
+        status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         message = f"Missing of invalid parameter: '{field}'."
         super().__init__(message, name, status_code)
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
+from cognee.tasks.storage import index_graph_edges
 from cognee.tasks.storage.add_data_points import add_data_points
 from cognee.modules.ontology.ontology_config import Config
 from cognee.modules.ontology.get_default_ontology_resolver import (
@@ -88,6 +89,7 @@ async def integrate_chunk_graphs(
 
     if len(graph_edges) > 0:
         await graph_engine.add_edges(graph_edges)
+        await index_graph_edges(graph_edges)
 
     return data_chunks
 
@@ -1,6 +1,6 @@
 import os
 from urllib.parse import urlparse
-from typing import List, Tuple
+from typing import Any, List, Tuple
 from pathlib import Path
 import tempfile
 
@@ -34,7 +34,8 @@ async def pull_from_s3(file_path, destination_file) -> None:
 
 
 async def data_item_to_text_file(
-    data_item_path: str,
+    data_item_path: str,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
 ) -> Tuple[str, LoaderInterface]:
     if isinstance(data_item_path, str):
         parsed_url = urlparse(data_item_path)
@@ -74,6 +75,5 @@ async def data_item_to_text_file(
         )
     else:
         raise IngestionError(message="Local files are not accepted.")
-
     # data is not a supported type
     raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@@ -27,7 +28,7 @@ async def ingest_data(
     user: User,
     node_set: Optional[List[str]] = None,
     dataset_id: UUID = None,
-    preferred_loaders:
+    preferred_loaders: dict[str, dict[str, Any]] = None,
 ):
     if not user:
         user = await get_default_user()
@@ -44,7 +45,7 @@ async def ingest_data(
     user: User,
     node_set: Optional[List[str]] = None,
     dataset_id: UUID = None,
-    preferred_loaders:
+    preferred_loaders: dict[str, dict[str, Any]] = None,
 ):
     new_datapoints = []
     existing_data_points = []
@@ -77,22 +78,27 @@ async def ingest_data(
     dataset_data_map = {str(data.id): True for data in dataset_data}
 
     for data_item in data:
-        # Get file path of data item or create a file it doesn't exist
+        # Get file path of data item or create a file if it doesn't exist
         original_file_path = await save_data_item_to_storage(data_item)
-
         # Transform file path to be OS usable
         actual_file_path = get_data_file_path(original_file_path)
 
         # Store all input data as text files in Cognee data storage
         cognee_storage_file_path, loader_engine = await data_item_to_text_file(
-            actual_file_path,
+            actual_file_path,
+            preferred_loaders,
        )
 
+        if loader_engine is None:
+            raise IngestionError("Loader cannot be None")
+
         # Find metadata from original file
+        # Standard flow: extract metadata from both original and stored files
         async with open_data_file(original_file_path) as file:
             classified_data = ingestion.classify(file)
 
         # data_id is the hash of original file contents + owner id to avoid duplicate data
+
         data_id = ingestion.identify(classified_data, user)
         original_file_metadata = classified_data.get_metadata()
 
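ingest_data() (and data_item_to_text_file() above) now accepts an optional preferred_loaders mapping of loader name to loader options, and raises IngestionError when no loader can be resolved. The sketch below only illustrates the expected shape of that mapping; the loader name and option keys are hypothetical and should be checked against cognee/infrastructure/loaders/supported_loaders.py:

    from typing import Any

    # Hypothetical preferred_loaders value: loader name -> options for that loader.
    preferred_loaders: dict[str, dict[str, Any]] = {
        "beautiful_soup_loader": {},  # name is a guess based on the new external loader module
    }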
@@ -8,6 +8,9 @@ from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
+from cognee.tasks.web_scraper.utils import fetch_page_content
+
+
 logger = get_logger()
 
 
@@ -27,6 +30,12 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
 
         return await get_data_from_llama_index(data_item)
 
+    if "docling" in str(type(data_item)):
+        from docling_core.types import DoclingDocument
+
+        if isinstance(data_item, DoclingDocument):
+            data_item = data_item.export_to_text()
+
     # data is a file object coming from upload.
     if hasattr(data_item, "file"):
         return await save_data_to_file(data_item.file, filename=data_item.filename)
@@ -48,7 +57,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
         # data is s3 file path
         if parsed_url.scheme == "s3":
             return data_item
-
+        elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
+            urls_to_page_contents = await fetch_page_content(data_item)
+            return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
         # data is local file path
         elif parsed_url.scheme == "file":
             if settings.accept_local_file_path:
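With this change, save_data_item_to_storage() treats http/https inputs as web pages: the page is fetched via the new fetch_page_content() helper and stored as an .html file, after which ingestion proceeds as for any other file. A minimal sketch, assuming the URL handling is reachable through the public cognee.add() API (the URL is illustrative):

    import asyncio

    import cognee


    async def main():
        # The URL is downloaded and saved as an HTML file in cognee's storage,
        # then processed by the regular pipeline.
        await cognee.add("https://example.com/article.html")
        await cognee.cognify()


    asyncio.run(main())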
@@ -10,9 +10,7 @@ from cognee.tasks.storage.exceptions import (
 )
 
 
-async def add_data_points(
-    data_points: List[DataPoint], update_edge_collection: bool = True
-) -> List[DataPoint]:
+async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
     """
     Add a batch of data points to the graph database by extracting nodes and edges,
     deduplicating them, and indexing them for retrieval.
@@ -25,9 +23,6 @@ async def add_data_points(
     Args:
         data_points (List[DataPoint]):
             A list of data points to process and insert into the graph.
-        update_edge_collection (bool, optional):
-            Whether to update the edge index after adding edges.
-            Defaults to True.
 
     Returns:
         List[DataPoint]:
@@ -73,12 +68,10 @@ async def add_data_points(
 
     graph_engine = await get_graph_engine()
 
+    await graph_engine.add_nodes(nodes)
     await index_data_points(nodes)
 
-    await graph_engine.add_nodes(nodes)
     await graph_engine.add_edges(edges)
-
-    if update_edge_collection:
-        await index_graph_edges()
+    await index_graph_edges(edges)
 
     return data_points
@@ -1,6 +1,6 @@
-
+import asyncio
 
-from cognee.
+from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 
@@ -33,18 +33,23 @@ async def index_data_points(data_points: list[DataPoint]):
             indexed_data_point.metadata["index_fields"] = [field_name]
             index_points[index_name].append(indexed_data_point)
 
[12 removed lines are not rendered in the source diff view]
+    tasks: list[asyncio.Task] = []
+    batch_size = vector_engine.embedding_engine.get_batch_size()
+
+    for index_name_and_field, points in index_points.items():
+        first = index_name_and_field.index("_")
+        index_name = index_name_and_field[:first]
+        field_name = index_name_and_field[first + 1 :]
+
+        # Create embedding requests per batch to run in parallel later
+        for i in range(0, len(points), batch_size):
+            batch = points[i : i + batch_size]
+            tasks.append(
+                asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch))
+            )
+
+    # Run all embedding requests in parallel
+    await asyncio.gather(*tasks)
 
     return data_points
 
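The rewritten index_data_points() above splits each index's points into embedding-engine-sized batches and submits the per-batch vector indexing calls concurrently via asyncio.gather. A self-contained sketch of that batching pattern, with a stand-in coroutine in place of the real vector engine call:

    import asyncio


    async def index_batch(index_name: str, field_name: str, batch: list) -> None:
        # Stand-in for vector_engine.index_data_points(index_name, field_name, batch)
        await asyncio.sleep(0)


    async def index_all(points_by_index: dict[str, list], batch_size: int) -> None:
        tasks = []
        for index_name_and_field, points in points_by_index.items():
            index_name, _, field_name = index_name_and_field.partition("_")
            for i in range(0, len(points), batch_size):
                tasks.append(index_batch(index_name, field_name, points[i : i + batch_size]))
        # All batches are indexed concurrently instead of one after another.
        await asyncio.gather(*tasks)


    asyncio.run(index_all({"Entity_name": list(range(10))}, batch_size=4))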
@@ -1,15 +1,20 @@
+import asyncio
+
 from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
-from cognee.shared.logging_utils import get_logger
+from cognee.shared.logging_utils import get_logger
 from collections import Counter
-
+from typing import Optional, Dict, Any, List, Tuple, Union
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.models.EdgeType import EdgeType
+from cognee.infrastructure.databases.graph.graph_db_interface import EdgeData
 
-logger = get_logger(
+logger = get_logger()
 
 
-async def index_graph_edges(
+async def index_graph_edges(
+    edges_data: Union[List[EdgeData], List[Tuple[str, str, str, Optional[Dict[str, Any]]]]] = None,
+):
     """
     Indexes graph edges by creating and managing vector indexes for relationship types.
 
@@ -35,13 +40,17 @@ async def index_graph_edges():
         index_points = {}
 
         vector_engine = get_vector_engine()
-
+
+        if edges_data is None:
+            graph_engine = await get_graph_engine()
+            _, edges_data = await graph_engine.get_graph_data()
+            logger.warning(
+                "Your graph edge embedding is deprecated, please pass edges to the index_graph_edges directly."
+            )
     except Exception as e:
         logger.error("Failed to initialize engines: %s", e)
         raise RuntimeError("Initialization error") from e
 
-    _, edges_data = await graph_engine.get_graph_data()
-
     edge_types = Counter(
         item.get("relationship_name")
         for edge in edges_data
@@ -69,15 +78,20 @@ async def index_graph_edges():
             indexed_data_point.metadata["index_fields"] = [field_name]
             index_points[index_name].append(indexed_data_point)
 
+    # Get maximum batch size for embedding model
+    batch_size = vector_engine.embedding_engine.get_batch_size()
+    tasks: list[asyncio.Task] = []
+
     for index_name, indexable_points in index_points.items():
         index_name, field_name = index_name.split(".")
 
-        #
-        batch_size = vector_engine.embedding_engine.get_batch_size()
-        # We save the data in batches of {batch_size} to not put a lot of pressure on the database
+        # Create embedding tasks to run in parallel later
         for start in range(0, len(indexable_points), batch_size):
             batch = indexable_points[start : start + batch_size]
 
-
+            tasks.append(vector_engine.index_data_points(index_name, field_name, batch))
+
+    # Start all embedding tasks and wait for completion
+    await asyncio.gather(*tasks)
 
     return None
@@ -0,0 +1,34 @@
+"""Web scraping module for cognee.
+
+This module provides tools for scraping web content, managing scraping jobs, and storing
+data in a graph database. It includes classes and functions for crawling web pages using
+BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
+"""
+
+from .utils import fetch_page_content
+from .default_url_crawler import DefaultUrlCrawler
+
+# Lazy import for web_scraper_task to avoid requiring apscheduler
+# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ...
+
+
+def __getattr__(name):
+    """Lazy load web scraper task functions that require apscheduler."""
+    if name == "cron_web_scraper_task":
+        from .web_scraper_task import cron_web_scraper_task
+
+        return cron_web_scraper_task
+    elif name == "web_scraper_task":
+        from .web_scraper_task import web_scraper_task
+
+        return web_scraper_task
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "BeautifulSoupCrawler",
+    "fetch_page_content",
+    "cron_web_scraper_task",
+    "web_scraper_task",
+    "DefaultUrlCrawler",
+]
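The new package __init__ uses a module-level __getattr__ (PEP 562) so that importing cognee.tasks.web_scraper stays lightweight and apscheduler is only required once one of the task entry points is actually requested:

    # Importing the package or the crawler does not require apscheduler.
    from cognee.tasks.web_scraper import DefaultUrlCrawler, fetch_page_content

    # Accessing web_scraper_task triggers the lazy import (needs apscheduler installed).
    from cognee.tasks.web_scraper import web_scraper_task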
@@ -0,0 +1,26 @@
+from pydantic import BaseModel, Field
+from typing import Any, Dict, Optional, Literal
+import os
+
+
+class TavilyConfig(BaseModel):
+    api_key: Optional[str] = os.getenv("TAVILY_API_KEY")
+    extract_depth: Literal["basic", "advanced"] = "basic"
+    proxies: Optional[Dict[str, str]] = None
+    timeout: Optional[int] = Field(default=10, ge=1, le=60)
+
+
+class DefaultCrawlerConfig(BaseModel):
+    concurrency: int = 5
+    crawl_delay: float = 0.5
+    max_crawl_delay: Optional[float] = (
+        10.0  # Maximum crawl delay to respect from robots.txt (None = no limit)
+    )
+    timeout: float = 15.0
+    max_retries: int = 2
+    retry_delay_factor: float = 0.5
+    headers: Optional[Dict[str, str]] = None
+    use_playwright: bool = False
+    playwright_js_wait: float = 0.8
+    robots_cache_ttl: float = 3600.0
+    join_all_matches: bool = False