cognee 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227)
  1. cognee/__init__.py +1 -0
  2. cognee/api/client.py +9 -5
  3. cognee/api/v1/add/add.py +2 -1
  4. cognee/api/v1/add/routers/get_add_router.py +3 -1
  5. cognee/api/v1/cognify/cognify.py +24 -16
  6. cognee/api/v1/cognify/routers/__init__.py +0 -1
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
  9. cognee/api/v1/ontologies/__init__.py +4 -0
  10. cognee/api/v1/ontologies/ontologies.py +158 -0
  11. cognee/api/v1/ontologies/routers/__init__.py +0 -0
  12. cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
  13. cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
  14. cognee/api/v1/search/search.py +4 -0
  15. cognee/api/v1/ui/node_setup.py +360 -0
  16. cognee/api/v1/ui/npm_utils.py +50 -0
  17. cognee/api/v1/ui/ui.py +38 -68
  18. cognee/cli/commands/cognify_command.py +8 -1
  19. cognee/cli/config.py +1 -1
  20. cognee/context_global_variables.py +86 -9
  21. cognee/eval_framework/Dockerfile +29 -0
  22. cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
  23. cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
  24. cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
  25. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
  26. cognee/eval_framework/eval_config.py +2 -2
  27. cognee/eval_framework/modal_run_eval.py +16 -28
  28. cognee/infrastructure/databases/cache/config.py +3 -1
  29. cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
  30. cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
  31. cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
  32. cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
  33. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
  34. cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
  35. cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
  36. cognee/infrastructure/databases/graph/config.py +7 -0
  37. cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
  38. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
  39. cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
  40. cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
  41. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
  43. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
  44. cognee/infrastructure/databases/utils/__init__.py +3 -0
  45. cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
  46. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
  47. cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
  48. cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
  49. cognee/infrastructure/databases/vector/config.py +5 -0
  50. cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
  51. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
  52. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
  53. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -10
  54. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
  55. cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
  56. cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
  57. cognee/infrastructure/engine/models/Edge.py +13 -1
  58. cognee/infrastructure/files/storage/s3_config.py +2 -0
  59. cognee/infrastructure/files/utils/guess_file_type.py +4 -0
  60. cognee/infrastructure/llm/LLMGateway.py +5 -2
  61. cognee/infrastructure/llm/config.py +37 -0
  62. cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
  71. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
  72. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
  73. cognee/infrastructure/loaders/LoaderEngine.py +1 -0
  74. cognee/infrastructure/loaders/core/__init__.py +2 -1
  75. cognee/infrastructure/loaders/core/csv_loader.py +93 -0
  76. cognee/infrastructure/loaders/core/text_loader.py +1 -2
  77. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
  78. cognee/infrastructure/loaders/supported_loaders.py +2 -1
  79. cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
  80. cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
  81. cognee/modules/chunking/CsvChunker.py +35 -0
  82. cognee/modules/chunking/models/DocumentChunk.py +2 -1
  83. cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
  84. cognee/modules/cognify/config.py +2 -0
  85. cognee/modules/data/deletion/prune_system.py +52 -2
  86. cognee/modules/data/methods/__init__.py +1 -0
  87. cognee/modules/data/methods/create_dataset.py +4 -2
  88. cognee/modules/data/methods/delete_dataset.py +26 -0
  89. cognee/modules/data/methods/get_dataset_ids.py +5 -1
  90. cognee/modules/data/methods/get_unique_data_id.py +68 -0
  91. cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
  92. cognee/modules/data/models/Dataset.py +2 -0
  93. cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
  94. cognee/modules/data/processing/document_types/__init__.py +1 -0
  95. cognee/modules/engine/models/Triplet.py +9 -0
  96. cognee/modules/engine/models/__init__.py +1 -0
  97. cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
  98. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
  99. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
  100. cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
  101. cognee/modules/ingestion/identify.py +4 -4
  102. cognee/modules/memify/memify.py +1 -7
  103. cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
  104. cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
  105. cognee/modules/pipelines/operations/pipeline.py +18 -2
  106. cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
  107. cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
  108. cognee/modules/retrieval/__init__.py +1 -1
  109. cognee/modules/retrieval/base_graph_retriever.py +7 -3
  110. cognee/modules/retrieval/base_retriever.py +7 -3
  111. cognee/modules/retrieval/completion_retriever.py +11 -4
  112. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
  113. cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
  114. cognee/modules/retrieval/graph_completion_retriever.py +14 -1
  115. cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
  116. cognee/modules/retrieval/register_retriever.py +10 -0
  117. cognee/modules/retrieval/registered_community_retrievers.py +1 -0
  118. cognee/modules/retrieval/temporal_retriever.py +13 -2
  119. cognee/modules/retrieval/triplet_retriever.py +182 -0
  120. cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
  121. cognee/modules/retrieval/utils/completion.py +2 -22
  122. cognee/modules/run_custom_pipeline/__init__.py +1 -0
  123. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
  124. cognee/modules/search/methods/get_search_type_tools.py +54 -8
  125. cognee/modules/search/methods/no_access_control_search.py +4 -0
  126. cognee/modules/search/methods/search.py +26 -3
  127. cognee/modules/search/types/SearchType.py +1 -1
  128. cognee/modules/settings/get_settings.py +19 -0
  129. cognee/modules/users/methods/create_user.py +12 -27
  130. cognee/modules/users/methods/get_authenticated_user.py +3 -2
  131. cognee/modules/users/methods/get_default_user.py +4 -2
  132. cognee/modules/users/methods/get_user.py +1 -1
  133. cognee/modules/users/methods/get_user_by_email.py +1 -1
  134. cognee/modules/users/models/DatasetDatabase.py +24 -3
  135. cognee/modules/users/models/Tenant.py +6 -7
  136. cognee/modules/users/models/User.py +6 -5
  137. cognee/modules/users/models/UserTenant.py +12 -0
  138. cognee/modules/users/models/__init__.py +1 -0
  139. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
  140. cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
  141. cognee/modules/users/tenants/methods/__init__.py +1 -0
  142. cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
  143. cognee/modules/users/tenants/methods/create_tenant.py +22 -8
  144. cognee/modules/users/tenants/methods/select_tenant.py +62 -0
  145. cognee/shared/logging_utils.py +6 -0
  146. cognee/shared/rate_limiting.py +30 -0
  147. cognee/tasks/chunks/__init__.py +1 -0
  148. cognee/tasks/chunks/chunk_by_row.py +94 -0
  149. cognee/tasks/documents/__init__.py +0 -1
  150. cognee/tasks/documents/classify_documents.py +2 -0
  151. cognee/tasks/feedback/generate_improved_answers.py +3 -3
  152. cognee/tasks/graph/extract_graph_from_data.py +9 -10
  153. cognee/tasks/ingestion/ingest_data.py +1 -1
  154. cognee/tasks/memify/__init__.py +2 -0
  155. cognee/tasks/memify/cognify_session.py +41 -0
  156. cognee/tasks/memify/extract_user_sessions.py +73 -0
  157. cognee/tasks/memify/get_triplet_datapoints.py +289 -0
  158. cognee/tasks/storage/add_data_points.py +142 -2
  159. cognee/tasks/storage/index_data_points.py +33 -22
  160. cognee/tasks/storage/index_graph_edges.py +37 -57
  161. cognee/tests/integration/documents/CsvDocument_test.py +70 -0
  162. cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
  163. cognee/tests/integration/tasks/test_add_data_points.py +139 -0
  164. cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
  165. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +1 -1
  166. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +1 -1
  167. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +13 -27
  168. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
  169. cognee/tests/test_add_docling_document.py +2 -2
  170. cognee/tests/test_cognee_server_start.py +84 -3
  171. cognee/tests/test_conversation_history.py +68 -5
  172. cognee/tests/test_data/example_with_header.csv +3 -0
  173. cognee/tests/test_dataset_database_handler.py +137 -0
  174. cognee/tests/test_dataset_delete.py +76 -0
  175. cognee/tests/test_edge_centered_payload.py +170 -0
  176. cognee/tests/test_edge_ingestion.py +27 -0
  177. cognee/tests/test_feedback_enrichment.py +1 -1
  178. cognee/tests/test_library.py +6 -4
  179. cognee/tests/test_load.py +62 -0
  180. cognee/tests/test_multi_tenancy.py +165 -0
  181. cognee/tests/test_parallel_databases.py +2 -0
  182. cognee/tests/test_pipeline_cache.py +164 -0
  183. cognee/tests/test_relational_db_migration.py +54 -2
  184. cognee/tests/test_search_db.py +44 -2
  185. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
  186. cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
  187. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
  188. cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
  189. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
  190. cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
  191. cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
  192. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
  193. cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
  194. cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
  195. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
  196. cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
  197. cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
  198. cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
  199. cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
  200. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
  201. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
  202. cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
  203. cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
  204. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
  205. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
  206. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
  207. cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
  208. cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
  209. cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
  210. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -6
  211. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/RECORD +215 -163
  212. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/WHEEL +1 -1
  213. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
  214. cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
  215. cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
  216. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
  217. cognee/modules/retrieval/code_retriever.py +0 -232
  218. cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
  219. cognee/tasks/code/get_local_dependencies_checker.py +0 -20
  220. cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
  221. cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
  222. cognee/tasks/repo_processor/__init__.py +0 -2
  223. cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
  224. cognee/tasks/repo_processor/get_non_code_files.py +0 -158
  225. cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
  226. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
  227. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/databases/graph/neo4j_driver/adapter.py

@@ -8,7 +8,7 @@ from neo4j import AsyncSession
  from neo4j import AsyncGraphDatabase
  from neo4j.exceptions import Neo4jError
  from contextlib import asynccontextmanager
- from typing import Optional, Any, List, Dict, Type, Tuple
+ from typing import Optional, Any, List, Dict, Type, Tuple, Coroutine

  from cognee.infrastructure.engine import DataPoint
  from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int

@@ -964,6 +964,63 @@ class Neo4jAdapter(GraphDBInterface):
              logger.error(f"Error during graph data retrieval: {str(e)}")
              raise

+     async def get_id_filtered_graph_data(self, target_ids: list[str]):
+         """
+         Retrieve graph data filtered by specific node IDs, including their direct neighbors
+         and only edges where one endpoint matches those IDs.
+
+         This version uses a single Cypher query for efficiency.
+         """
+         import time
+
+         start_time = time.time()
+
+         try:
+             if not target_ids:
+                 logger.warning("No target IDs provided for ID-filtered graph retrieval.")
+                 return [], []
+
+             query = """
+                 MATCH ()-[r]-()
+                 WHERE startNode(r).id IN $target_ids
+                    OR endNode(r).id IN $target_ids
+                 WITH DISTINCT r, startNode(r) AS a, endNode(r) AS b
+                 RETURN
+                     properties(a) AS n_properties,
+                     properties(b) AS m_properties,
+                     type(r) AS type,
+                     properties(r) AS properties
+             """
+
+             result = await self.query(query, {"target_ids": target_ids})
+
+             nodes_dict = {}
+             edges = []
+
+             for record in result:
+                 n_props = record["n_properties"]
+                 m_props = record["m_properties"]
+                 r_props = record["properties"]
+                 r_type = record["type"]
+
+                 nodes_dict[n_props["id"]] = (n_props["id"], n_props)
+                 nodes_dict[m_props["id"]] = (m_props["id"], m_props)
+
+                 source_id = r_props.get("source_node_id", n_props["id"])
+                 target_id = r_props.get("target_node_id", m_props["id"])
+                 edges.append((source_id, target_id, r_type, r_props))
+
+             retrieval_time = time.time() - start_time
+             logger.info(
+                 f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s"
+             )
+
+             return list(nodes_dict.values()), edges
+
+         except Exception as e:
+             logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}")
+             raise
+
      async def get_nodeset_subgraph(
          self, node_type: Type[Any], node_name: List[str]
      ) -> Tuple[List[Tuple[int, dict]], List[Tuple[int, int, str, dict]]]:
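The (nodes, edges) return shape matches the adapter's existing full-graph retrieval, so the new method can be swapped in wherever a caller already knows which node IDs matter. A minimal usage sketch, assuming the usual get_graph_engine() factory yields this Neo4j adapter and that the target IDs come from an earlier search step:

    import asyncio

    from cognee.infrastructure.databases.graph import get_graph_engine


    async def main():
        graph_engine = await get_graph_engine()

        # Hypothetical IDs; in practice these come from a prior vector or index lookup.
        target_ids = ["node-id-1", "node-id-2"]

        nodes, edges = await graph_engine.get_id_filtered_graph_data(target_ids)
        # nodes: (node_id, properties) tuples, deduplicated by ID
        # edges: (source_id, target_id, relationship_type, properties) tuples
        print(len(nodes), len(edges))


    asyncio.run(main())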
cognee/infrastructure/databases/graph/neo4j_driver/adapter.py

@@ -1470,3 +1527,25 @@ class Neo4jAdapter(GraphDBInterface):
          time_ids_list = [item["id"] for item in time_nodes if "id" in item]

          return ", ".join(f"'{uid}'" for uid in time_ids_list)
+
+     async def get_triplets_batch(self, offset: int, limit: int) -> list[dict[str, Any]]:
+         """
+         Retrieve a batch of triplets (start_node, relationship, end_node) from the graph.
+
+         Parameters:
+         -----------
+             - offset (int): Number of triplets to skip before returning results.
+             - limit (int): Maximum number of triplets to return.
+
+         Returns:
+         --------
+             - list[dict[str, Any]]: A list of triplets.
+         """
+         query = f"""
+             MATCH (start_node:`{BASE_LABEL}`)-[relationship]->(end_node:`{BASE_LABEL}`)
+             RETURN start_node, properties(relationship) AS relationship_properties, end_node
+             SKIP $offset LIMIT $limit
+         """
+         results = await self.query(query, {"offset": offset, "limit": limit})
+
+         return results
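Since get_triplets_batch is a plain offset/limit pager, exporting every triplet is a loop that advances the offset until a short page comes back. A sketch, assuming an initialized adapter:

    async def iterate_all_triplets(graph_engine, batch_size: int = 500):
        # Page through the graph until a batch comes back smaller than requested.
        offset = 0
        while True:
            batch = await graph_engine.get_triplets_batch(offset=offset, limit=batch_size)
            for triplet in batch:
                yield triplet  # dict with start_node, relationship_properties, end_node
            if len(batch) < batch_size:
                break
            offset += batch_size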
cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py

@@ -416,6 +416,15 @@ class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface):
          self._client.query(f"MATCH (n :{self._VECTOR_NODE_LABEL}) DETACH DELETE n")
          pass

+     async def is_empty(self) -> bool:
+         query = """
+         MATCH (n)
+         RETURN true
+         LIMIT 1;
+         """
+         query_result = await self._client.query(query)
+         return len(query_result) == 0
+
      @staticmethod
      def _get_scored_result(
          item: dict, with_vector: bool = False, with_score: bool = False
cognee/infrastructure/databases/utils/__init__.py

@@ -1 +1,4 @@
  from .get_or_create_dataset_database import get_or_create_dataset_database
+ from .resolve_dataset_database_connection_info import resolve_dataset_database_connection_info
+ from .get_graph_dataset_database_handler import get_graph_dataset_database_handler
+ from .get_vector_dataset_database_handler import get_vector_dataset_database_handler
cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py

@@ -0,0 +1,10 @@
+ from cognee.modules.users.models.DatasetDatabase import DatasetDatabase
+
+
+ def get_graph_dataset_database_handler(dataset_database: DatasetDatabase) -> dict:
+     from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
+         supported_dataset_database_handlers,
+     )
+
+     handler = supported_dataset_database_handlers[dataset_database.graph_dataset_database_handler]
+     return handler
cognee/infrastructure/databases/utils/get_or_create_dataset_database.py

@@ -1,16 +1,65 @@
  from uuid import UUID
- from typing import Union
+ from typing import Union, Optional

  from sqlalchemy import select
  from sqlalchemy.exc import IntegrityError
- from cognee.modules.data.methods import create_dataset

+ from cognee.modules.data.methods import create_dataset
  from cognee.infrastructure.databases.relational import get_relational_engine
+ from cognee.infrastructure.databases.vector import get_vectordb_config
+ from cognee.infrastructure.databases.graph.config import get_graph_config
  from cognee.modules.data.methods import get_unique_dataset_id
  from cognee.modules.users.models import DatasetDatabase
  from cognee.modules.users.models import User


+ async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict:
+     vector_config = get_vectordb_config()
+
+     from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
+         supported_dataset_database_handlers,
+     )
+
+     handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler]
+     return await handler["handler_instance"].create_dataset(dataset_id, user)
+
+
+ async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict:
+     graph_config = get_graph_config()
+
+     from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
+         supported_dataset_database_handlers,
+     )
+
+     handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler]
+     return await handler["handler_instance"].create_dataset(dataset_id, user)
+
+
+ async def _existing_dataset_database(
+     dataset_id: UUID,
+     user: User,
+ ) -> Optional[DatasetDatabase]:
+     """
+     Check if a DatasetDatabase row already exists for the given owner + dataset.
+     Return None if it doesn't exist, return the row if it does.
+     Args:
+         dataset_id:
+         user:
+
+     Returns:
+         DatasetDatabase or None
+     """
+     db_engine = get_relational_engine()
+
+     async with db_engine.get_async_session() as session:
+         stmt = select(DatasetDatabase).where(
+             DatasetDatabase.owner_id == user.id,
+             DatasetDatabase.dataset_id == dataset_id,
+         )
+         existing: DatasetDatabase = await session.scalar(stmt)
+         return existing
+
+
  async def get_or_create_dataset_database(
      dataset: Union[str, UUID],
      user: User,

@@ -21,6 +70,8 @@ async def get_or_create_dataset_database(
      • If the row already exists, it is fetched and returned.
      • Otherwise a new one is created atomically and returned.

+     DatasetDatabase row contains connection and provider info for vector and graph databases.
+
      Parameters
      ----------
      user : User

@@ -32,29 +83,26 @@ async def get_or_create_dataset_database(

      dataset_id = await get_unique_dataset_id(dataset, user)

-     vector_db_name = f"{dataset_id}.lance.db"
-     graph_db_name = f"{dataset_id}.pkl"
+     # If dataset is given as name make sure the dataset is created first
+     if isinstance(dataset, str):
+         async with db_engine.get_async_session() as session:
+             await create_dataset(dataset, user, session)

-     async with db_engine.get_async_session() as session:
-         # Create dataset if it doesn't exist
-         if isinstance(dataset, str):
-             dataset = await create_dataset(dataset, user, session)
+     # If dataset database already exists return it
+     existing_dataset_database = await _existing_dataset_database(dataset_id, user)
+     if existing_dataset_database:
+         return existing_dataset_database

-         # Try to fetch an existing row first
-         stmt = select(DatasetDatabase).where(
-             DatasetDatabase.owner_id == user.id,
-             DatasetDatabase.dataset_id == dataset_id,
-         )
-         existing: DatasetDatabase = await session.scalar(stmt)
-         if existing:
-             return existing
+     graph_config_dict = await _get_graph_db_info(dataset_id, user)
+     vector_config_dict = await _get_vector_db_info(dataset_id, user)

+     async with db_engine.get_async_session() as session:
          # If there are no existing rows build a new row
          record = DatasetDatabase(
              owner_id=user.id,
              dataset_id=dataset_id,
-             vector_database_name=vector_db_name,
-             graph_database_name=graph_db_name,
+             **graph_config_dict,  # Unpack graph db config
+             **vector_config_dict,  # Unpack vector db config
          )

          try:
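Both _get_vector_db_info and _get_graph_db_info resolve a handler out of supported_dataset_database_handlers and call its create_dataset classmethod to obtain the column values for the new DatasetDatabase row. A sketch of the registry shape this dispatch assumes (illustrative; the real mapping lives in supported_dataset_database_handlers.py, file 33 above, and registers more providers):

    from cognee.infrastructure.databases.vector.lancedb.LanceDBDatasetDatabaseHandler import (
        LanceDBDatasetDatabaseHandler,
    )

    # Each entry maps a handler name to a dict carrying the handler class under
    # "handler_instance"; create_dataset is a classmethod, so the class itself works here.
    supported_dataset_database_handlers = {
        "lancedb": {"handler_instance": LanceDBDatasetDatabaseHandler},
        # other providers (e.g. kuzu, neo4j_aura) register the same way
    }

    # Dispatch as done in _get_vector_db_info above:
    # handler = supported_dataset_database_handlers["lancedb"]
    # connection_info = await handler["handler_instance"].create_dataset(dataset_id, user)
    # -> dict of DatasetDatabase column values (provider, url, key, name, handler)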
cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py

@@ -0,0 +1,10 @@
+ from cognee.modules.users.models.DatasetDatabase import DatasetDatabase
+
+
+ def get_vector_dataset_database_handler(dataset_database: DatasetDatabase) -> dict:
+     from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
+         supported_dataset_database_handlers,
+     )
+
+     handler = supported_dataset_database_handlers[dataset_database.vector_dataset_database_handler]
+     return handler
cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py

@@ -0,0 +1,30 @@
+ from cognee.infrastructure.databases.utils.get_graph_dataset_database_handler import (
+     get_graph_dataset_database_handler,
+ )
+ from cognee.infrastructure.databases.utils.get_vector_dataset_database_handler import (
+     get_vector_dataset_database_handler,
+ )
+ from cognee.modules.users.models.DatasetDatabase import DatasetDatabase
+
+
+ async def resolve_dataset_database_connection_info(
+     dataset_database: DatasetDatabase,
+ ) -> DatasetDatabase:
+     """
+     Resolve the connection info for the given DatasetDatabase instance.
+     Resolve both vector and graph database connection info and return the updated DatasetDatabase instance.
+
+     Args:
+         dataset_database: DatasetDatabase instance
+     Returns:
+         DatasetDatabase instance with resolved connection info
+     """
+     vector_dataset_database_handler = get_vector_dataset_database_handler(dataset_database)
+     graph_dataset_database_handler = get_graph_dataset_database_handler(dataset_database)
+     dataset_database = await vector_dataset_database_handler[
+         "handler_instance"
+     ].resolve_dataset_connection_info(dataset_database)
+     dataset_database = await graph_dataset_database_handler[
+         "handler_instance"
+     ].resolve_dataset_connection_info(dataset_database)
+     return dataset_database
cognee/infrastructure/databases/vector/config.py

@@ -18,14 +18,17 @@ class VectorConfig(BaseSettings):
      Instance variables:
      - vector_db_url: The URL of the vector database.
      - vector_db_port: The port for the vector database.
+     - vector_db_name: The name of the vector database.
      - vector_db_key: The key for accessing the vector database.
      - vector_db_provider: The provider for the vector database.
      """

      vector_db_url: str = ""
      vector_db_port: int = 1234
+     vector_db_name: str = ""
      vector_db_key: str = ""
      vector_db_provider: str = "lancedb"
+     vector_dataset_database_handler: str = "lancedb"

      model_config = SettingsConfigDict(env_file=".env", extra="allow")


@@ -58,8 +61,10 @@ class VectorConfig(BaseSettings):
          return {
              "vector_db_url": self.vector_db_url,
              "vector_db_port": self.vector_db_port,
+             "vector_db_name": self.vector_db_name,
              "vector_db_key": self.vector_db_key,
              "vector_db_provider": self.vector_db_provider,
+             "vector_dataset_database_handler": self.vector_dataset_database_handler,
          }

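VectorConfig is a pydantic BaseSettings with env_file=".env", so the new fields can be driven from the environment. A sketch, assuming pydantic-settings' default field-name-to-env-var mapping (adjust if the project declares explicit aliases):

    import os

    # Assumed env var names, derived from the field names above.
    os.environ["VECTOR_DB_PROVIDER"] = "lancedb"
    os.environ["VECTOR_DB_NAME"] = "my_dataset.lance.db"
    os.environ["VECTOR_DATASET_DATABASE_HANDLER"] = "lancedb"

    from cognee.infrastructure.databases.vector import get_vectordb_config

    config = get_vectordb_config()
    print(config.vector_db_name)  # my_dataset.lance.db

Note that get_vectordb_config may cache its result, so environment variables should be in place before the first call.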
cognee/infrastructure/databases/vector/create_vector_engine.py

@@ -1,5 +1,6 @@
  from .supported_databases import supported_databases
  from .embeddings import get_embedding_engine
+ from cognee.infrastructure.databases.graph.config import get_graph_context_config

  from functools import lru_cache

@@ -8,8 +9,10 @@ from functools import lru_cache
  def create_vector_engine(
      vector_db_provider: str,
      vector_db_url: str,
+     vector_db_name: str,
      vector_db_port: str = "",
      vector_db_key: str = "",
+     vector_dataset_database_handler: str = "",
  ):
      """
      Create a vector database engine based on the specified provider.

@@ -27,6 +30,7 @@ def create_vector_engine(
      - vector_db_url (str): The URL for the vector database instance.
      - vector_db_port (str): The port for the vector database instance. Required for some
        providers.
+     - vector_db_name (str): The name of the vector database instance.
      - vector_db_key (str): The API key or access token for the vector database instance.
      - vector_db_provider (str): The name of the vector database provider to use (e.g.,
        'pgvector').

@@ -45,6 +49,7 @@ def create_vector_engine(
              url=vector_db_url,
              api_key=vector_db_key,
              embedding_engine=embedding_engine,
+             database_name=vector_db_name,
          )

      if vector_db_provider.lower() == "pgvector":

@@ -133,6 +138,6 @@ def create_vector_engine(

      else:
          raise EnvironmentError(
-             f"Unsupported graph database provider: {vector_db_provider}. "
+             f"Unsupported vector database provider: {vector_db_provider}. "
              f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['LanceDB', 'PGVector', 'neptune_analytics', 'ChromaDB'])}"
          )
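A sketch of calling the updated factory directly; the values are illustrative, and for LanceDB the new vector_db_name is forwarded to the adapter as database_name (per the hunk above). In cognee these arguments normally come from a DatasetDatabase row:

    from cognee.infrastructure.databases.vector.create_vector_engine import create_vector_engine

    vector_engine = create_vector_engine(
        vector_db_provider="lancedb",
        vector_db_url="/data/databases/<user_id>/<dataset_id>.lance.db",  # local path for LanceDB
        vector_db_name="<dataset_id>.lance.db",
    )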
cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py

@@ -17,6 +17,7 @@ from cognee.infrastructure.databases.exceptions import EmbeddingException
  from cognee.infrastructure.llm.tokenizer.TikToken import (
      TikTokenTokenizer,
  )
+ from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager

  litellm.set_verbose = False
  logger = get_logger("FastembedEmbeddingEngine")

@@ -68,7 +69,7 @@ class FastembedEmbeddingEngine(EmbeddingEngine):

      @retry(
          stop=stop_after_delay(128),
-         wait=wait_exponential_jitter(2, 128),
+         wait=wait_exponential_jitter(8, 128),
          retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
          before_sleep=before_sleep_log(logger, logging.DEBUG),
          reraise=True,

@@ -96,11 +97,12 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
          if self.mock:
              return [[0.0] * self.dimensions for _ in text]
          else:
-             embeddings = self.embedding_model.embed(
-                 text,
-                 batch_size=len(text),
-                 parallel=None,
-             )
+             async with embedding_rate_limiter_context_manager():
+                 embeddings = self.embedding_model.embed(
+                     text,
+                     batch_size=len(text),
+                     parallel=None,
+                 )

          return list(embeddings)

cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py

@@ -25,6 +25,7 @@ from cognee.infrastructure.llm.tokenizer.Mistral import (
  from cognee.infrastructure.llm.tokenizer.TikToken import (
      TikTokenTokenizer,
  )
+ from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager

  litellm.set_verbose = False
  logger = get_logger("LiteLLMEmbeddingEngine")

@@ -109,13 +110,14 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
              response = {"data": [{"embedding": [0.0] * self.dimensions} for _ in text]}
              return [data["embedding"] for data in response["data"]]
          else:
-             response = await litellm.aembedding(
-                 model=self.model,
-                 input=text,
-                 api_key=self.api_key,
-                 api_base=self.endpoint,
-                 api_version=self.api_version,
-             )
+             async with embedding_rate_limiter_context_manager():
+                 response = await litellm.aembedding(
+                     model=self.model,
+                     input=text,
+                     api_key=self.api_key,
+                     api_base=self.endpoint,
+                     api_version=self.api_version,
+                 )

          return [data["embedding"] for data in response.data]

cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py

@@ -18,10 +18,7 @@ from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import Em
  from cognee.infrastructure.llm.tokenizer.HuggingFace import (
      HuggingFaceTokenizer,
  )
- from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
-     embedding_rate_limit_async,
-     embedding_sleep_and_retry_async,
- )
+ from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
  from cognee.shared.utils import create_secure_ssl_context

  logger = get_logger("OllamaEmbeddingEngine")

@@ -101,7 +98,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):

      @retry(
          stop=stop_after_delay(128),
-         wait=wait_exponential_jitter(2, 128),
+         wait=wait_exponential_jitter(8, 128),
          retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
          before_sleep=before_sleep_log(logger, logging.DEBUG),
          reraise=True,

@@ -120,11 +117,15 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
          ssl_context = create_secure_ssl_context()
          connector = aiohttp.TCPConnector(ssl=ssl_context)
          async with aiohttp.ClientSession(connector=connector) as session:
-             async with session.post(
-                 self.endpoint, json=payload, headers=headers, timeout=60.0
-             ) as response:
-                 data = await response.json()
-                 return data["embeddings"][0]
+             async with embedding_rate_limiter_context_manager():
+                 async with session.post(
+                     self.endpoint, json=payload, headers=headers, timeout=60.0
+                 ) as response:
+                     data = await response.json()
+                     if "embeddings" in data:
+                         return data["embeddings"][0]
+                     else:
+                         return data["data"][0]["embedding"]

      def get_vector_size(self) -> int:
          """
cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py

@@ -193,6 +193,8 @@ class LanceDBAdapter(VectorDBInterface):
              for (data_point_index, data_point) in enumerate(data_points)
          ]

+         lance_data_points = list({dp.id: dp for dp in lance_data_points}.values())
+
          async with self.VECTOR_DB_LOCK:
              await (
                  collection.merge_insert("id")
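The inserted line deduplicates the batch by id before merge_insert, which would otherwise see the same key twice in one upsert. The idiom in isolation — dict keys are unique and later assignments win, so the last occurrence of each id is kept:

    points = [{"id": 1, "v": "a"}, {"id": 2, "v": "b"}, {"id": 1, "v": "c"}]

    # Keeps the last entry per id, in order of each id's first appearance.
    deduped = list({p["id"]: p for p in points}.values())
    print(deduped)  # [{'id': 1, 'v': 'c'}, {'id': 2, 'v': 'b'}]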
cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py

@@ -0,0 +1,50 @@
+ import os
+ from uuid import UUID
+ from typing import Optional
+
+ from cognee.infrastructure.databases.vector.create_vector_engine import create_vector_engine
+ from cognee.modules.users.models import User
+ from cognee.modules.users.models import DatasetDatabase
+ from cognee.base_config import get_base_config
+ from cognee.infrastructure.databases.vector import get_vectordb_config
+ from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface
+
+
+ class LanceDBDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
+     """
+     Handler for interacting with LanceDB Dataset databases.
+     """
+
+     @classmethod
+     async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict:
+         vector_config = get_vectordb_config()
+         base_config = get_base_config()
+
+         if vector_config.vector_db_provider != "lancedb":
+             raise ValueError(
+                 "LanceDBDatasetDatabaseHandler can only be used with LanceDB vector database provider."
+             )
+
+         databases_directory_path = os.path.join(
+             base_config.system_root_directory, "databases", str(user.id)
+         )
+
+         vector_db_name = f"{dataset_id}.lance.db"
+
+         return {
+             "vector_database_provider": vector_config.vector_db_provider,
+             "vector_database_url": os.path.join(databases_directory_path, vector_db_name),
+             "vector_database_key": vector_config.vector_db_key,
+             "vector_database_name": vector_db_name,
+             "vector_dataset_database_handler": "lancedb",
+         }
+
+     @classmethod
+     async def delete_dataset(cls, dataset_database: DatasetDatabase):
+         vector_engine = create_vector_engine(
+             vector_db_provider=dataset_database.vector_database_provider,
+             vector_db_url=dataset_database.vector_database_url,
+             vector_db_key=dataset_database.vector_database_key,
+             vector_db_name=dataset_database.vector_database_name,
+         )
+         await vector_engine.prune()
cognee/infrastructure/databases/vector/vector_db_interface.py

@@ -2,6 +2,8 @@ from typing import List, Protocol, Optional, Union, Any
  from abc import abstractmethod
  from cognee.infrastructure.engine import DataPoint
  from .models.PayloadSchema import PayloadSchema
+ from uuid import UUID
+ from cognee.modules.users.models import User


  class VectorDBInterface(Protocol):

@@ -217,3 +219,36 @@ class VectorDBInterface(Protocol):
          - Any: The schema object suitable for this vector database
          """
          return model_type
+
+     @classmethod
+     async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict:
+         """
+         Return a dictionary with connection info for a vector database for the given dataset.
+         Function can auto handle deploying of the actual database if needed, but is not necessary.
+         Only providing connection info is sufficient, this info will be mapped when trying to connect to the provided dataset in the future.
+         Needed for Cognee multi-tenant/multi-user and backend access control support.
+
+         Dictionary returned from this function will be used to create a DatasetDatabase row in the relational database.
+         From which internal mapping of dataset -> database connection info will be done.
+
+         Each dataset needs to map to a unique vector database when backend access control is enabled to facilitate a separation of concern for data.
+
+         Args:
+             dataset_id: UUID of the dataset if needed by the database creation logic
+             user: User object if needed by the database creation logic
+         Returns:
+             dict: Connection info for the created vector database instance.
+         """
+         pass
+
+     async def delete_dataset(self, dataset_id: UUID, user: User) -> None:
+         """
+         Delete the vector database for the given dataset.
+         Function should auto handle deleting of the actual database or send a request to the proper service to delete the database.
+         Needed for maintaining a database for Cognee multi-tenant/multi-user and backend access control.
+
+         Args:
+             dataset_id: UUID of the dataset
+             user: User object
+         """
+         pass
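These two hooks, together with DatasetDatabaseHandlerInterface (file 32 above), define the contract a backend must satisfy to take part in per-dataset database routing; the LanceDB handler above is the in-tree reference. A hypothetical handler for a hosted backend might look like this (class name, URL scheme, and key handling are illustrative, mirroring the dict shape the LanceDB handler returns):

    from uuid import UUID
    from typing import Optional

    from cognee.modules.users.models import User


    class MyVectorDatasetDatabaseHandler:
        """Hypothetical handler; mirrors the dict shape LanceDB's handler returns."""

        @classmethod
        async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict:
            # Provision (or merely name) a per-dataset database, then report how to reach it.
            return {
                "vector_database_provider": "my_provider",
                "vector_database_url": f"https://vectors.example.com/{user.id}/{dataset_id}",
                "vector_database_key": "<api-key-or-empty>",
                "vector_database_name": str(dataset_id),
                "vector_dataset_database_handler": "my_provider",
            }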
cognee/infrastructure/engine/models/Edge.py

@@ -1,4 +1,4 @@
- from pydantic import BaseModel
+ from pydantic import BaseModel, field_validator
  from typing import Optional, Any, Dict


@@ -18,9 +18,21 @@ class Edge(BaseModel):

      # Mixed usage
      has_items: (Edge(weight=0.5, weights={"confidence": 0.9}), list[Item])
+
+     # With edge_text for rich embedding representation
+     contains: (Edge(relationship_type="contains", edge_text="relationship_name: contains; entity_description: Alice"), Entity)
      """

      weight: Optional[float] = None
      weights: Optional[Dict[str, float]] = None
      relationship_type: Optional[str] = None
      properties: Optional[Dict[str, Any]] = None
+     edge_text: Optional[str] = None
+
+     @field_validator("edge_text", mode="before")
+     @classmethod
+     def ensure_edge_text(cls, v, info):
+         """Auto-populate edge_text from relationship_type if not explicitly provided."""
+         if v is None and info.data.get("relationship_type"):
+             return info.data["relationship_type"]
+         return v
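Because relationship_type is declared before edge_text, it is already present in info.data when the mode="before" validator runs, so edge_text reliably falls back to the relationship type. A quick demonstration:

    from cognee.infrastructure.engine.models.Edge import Edge

    e1 = Edge(relationship_type="contains")
    print(e1.edge_text)  # "contains" -- auto-populated by the validator

    e2 = Edge(
        relationship_type="contains",
        edge_text="relationship_name: contains; entity_description: Alice",
    )
    print(e2.edge_text)  # the explicit value wins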
cognee/infrastructure/files/storage/s3_config.py

@@ -9,6 +9,8 @@ class S3Config(BaseSettings):
      aws_access_key_id: Optional[str] = None
      aws_secret_access_key: Optional[str] = None
      aws_session_token: Optional[str] = None
+     aws_profile_name: Optional[str] = None
+     aws_bedrock_runtime_endpoint: Optional[str] = None
      model_config = SettingsConfigDict(env_file=".env", extra="allow")

cognee/infrastructure/files/utils/guess_file_type.py

@@ -55,6 +55,10 @@ def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type
          file_type = Type("text/plain", "txt")
          return file_type

+     if ext in [".csv"]:
+         file_type = Type("text/csv", "csv")
+         return file_type
+
      file_type = filetype.guess(file)

      # If file type could not be determined consider it a plain text file as they don't have magic number encoding
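With this branch a .csv extension short-circuits magic-number detection, which CSV files lack. A sketch, assuming ext is derived from the optional name argument as the surrounding code implies:

    import io

    from cognee.infrastructure.files.utils.guess_file_type import guess_file_type

    csv_bytes = io.BytesIO(b"name,age\nAlice,30\n")
    file_type = guess_file_type(csv_bytes, name="example_with_header.csv")
    print(file_type.mime, file_type.extension)  # text/csv csv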
cognee/infrastructure/llm/LLMGateway.py

@@ -11,7 +11,7 @@ class LLMGateway:

      @staticmethod
      def acreate_structured_output(
-         text_input: str, system_prompt: str, response_model: Type[BaseModel]
+         text_input: str, system_prompt: str, response_model: Type[BaseModel], **kwargs
      ) -> Coroutine:
          llm_config = get_llm_config()
          if llm_config.structured_output_framework.upper() == "BAML":

@@ -31,7 +31,10 @@ class LLMGateway:

          llm_client = get_llm_client()
          return llm_client.acreate_structured_output(
-             text_input=text_input, system_prompt=system_prompt, response_model=response_model
+             text_input=text_input,
+             system_prompt=system_prompt,
+             response_model=response_model,
+             **kwargs,
          )

      @staticmethod
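The **kwargs passthrough lets callers forward provider-specific options through LLMGateway down to the concrete adapter. A sketch (the model and the extra keyword are illustrative; which kwargs a given adapter honors depends on the provider):

    from pydantic import BaseModel

    from cognee.infrastructure.llm.LLMGateway import LLMGateway


    class Person(BaseModel):
        name: str
        age: int


    async def extract_person(text: str) -> Person:
        # Extra keyword arguments now ride along to the underlying client.
        return await LLMGateway.acreate_structured_output(
            text_input=text,
            system_prompt="Extract the person described in the text.",
            response_model=Person,
            # e.g. temperature=0.0,  # hypothetical provider-specific option
        )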