cognee 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. cognee/__init__.py +1 -0
  2. cognee/api/client.py +9 -5
  3. cognee/api/v1/add/add.py +2 -1
  4. cognee/api/v1/add/routers/get_add_router.py +3 -1
  5. cognee/api/v1/cognify/cognify.py +24 -16
  6. cognee/api/v1/cognify/routers/__init__.py +0 -1
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
  9. cognee/api/v1/ontologies/__init__.py +4 -0
  10. cognee/api/v1/ontologies/ontologies.py +158 -0
  11. cognee/api/v1/ontologies/routers/__init__.py +0 -0
  12. cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
  13. cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
  14. cognee/api/v1/search/search.py +4 -0
  15. cognee/api/v1/ui/node_setup.py +360 -0
  16. cognee/api/v1/ui/npm_utils.py +50 -0
  17. cognee/api/v1/ui/ui.py +38 -68
  18. cognee/cli/commands/cognify_command.py +8 -1
  19. cognee/cli/config.py +1 -1
  20. cognee/context_global_variables.py +86 -9
  21. cognee/eval_framework/Dockerfile +29 -0
  22. cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
  23. cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
  24. cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
  25. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
  26. cognee/eval_framework/eval_config.py +2 -2
  27. cognee/eval_framework/modal_run_eval.py +16 -28
  28. cognee/infrastructure/databases/cache/config.py +3 -1
  29. cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
  30. cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
  31. cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
  32. cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
  33. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
  34. cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
  35. cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
  36. cognee/infrastructure/databases/graph/config.py +7 -0
  37. cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
  38. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
  39. cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
  40. cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
  41. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
  43. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
  44. cognee/infrastructure/databases/utils/__init__.py +3 -0
  45. cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
  46. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
  47. cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
  48. cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
  49. cognee/infrastructure/databases/vector/config.py +5 -0
  50. cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
  51. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
  52. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
  53. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -13
  54. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
  55. cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
  56. cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
  57. cognee/infrastructure/engine/models/Edge.py +13 -1
  58. cognee/infrastructure/files/storage/s3_config.py +2 -0
  59. cognee/infrastructure/files/utils/guess_file_type.py +4 -0
  60. cognee/infrastructure/llm/LLMGateway.py +5 -2
  61. cognee/infrastructure/llm/config.py +37 -0
  62. cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
  71. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
  72. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
  73. cognee/infrastructure/loaders/LoaderEngine.py +1 -0
  74. cognee/infrastructure/loaders/core/__init__.py +2 -1
  75. cognee/infrastructure/loaders/core/csv_loader.py +93 -0
  76. cognee/infrastructure/loaders/core/text_loader.py +1 -2
  77. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
  78. cognee/infrastructure/loaders/supported_loaders.py +2 -1
  79. cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
  80. cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
  81. cognee/modules/chunking/CsvChunker.py +35 -0
  82. cognee/modules/chunking/models/DocumentChunk.py +2 -1
  83. cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
  84. cognee/modules/cognify/config.py +2 -0
  85. cognee/modules/data/deletion/prune_system.py +52 -2
  86. cognee/modules/data/methods/__init__.py +1 -0
  87. cognee/modules/data/methods/create_dataset.py +4 -2
  88. cognee/modules/data/methods/delete_dataset.py +26 -0
  89. cognee/modules/data/methods/get_dataset_ids.py +5 -1
  90. cognee/modules/data/methods/get_unique_data_id.py +68 -0
  91. cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
  92. cognee/modules/data/models/Dataset.py +2 -0
  93. cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
  94. cognee/modules/data/processing/document_types/__init__.py +1 -0
  95. cognee/modules/engine/models/Triplet.py +9 -0
  96. cognee/modules/engine/models/__init__.py +1 -0
  97. cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
  98. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
  99. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
  100. cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
  101. cognee/modules/ingestion/identify.py +4 -4
  102. cognee/modules/memify/memify.py +1 -7
  103. cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
  104. cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
  105. cognee/modules/pipelines/operations/pipeline.py +18 -2
  106. cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
  107. cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
  108. cognee/modules/retrieval/__init__.py +1 -1
  109. cognee/modules/retrieval/base_graph_retriever.py +7 -3
  110. cognee/modules/retrieval/base_retriever.py +7 -3
  111. cognee/modules/retrieval/completion_retriever.py +11 -4
  112. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
  113. cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
  114. cognee/modules/retrieval/graph_completion_retriever.py +14 -1
  115. cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
  116. cognee/modules/retrieval/register_retriever.py +10 -0
  117. cognee/modules/retrieval/registered_community_retrievers.py +1 -0
  118. cognee/modules/retrieval/temporal_retriever.py +13 -2
  119. cognee/modules/retrieval/triplet_retriever.py +182 -0
  120. cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
  121. cognee/modules/retrieval/utils/completion.py +2 -22
  122. cognee/modules/run_custom_pipeline/__init__.py +1 -0
  123. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
  124. cognee/modules/search/methods/get_search_type_tools.py +54 -8
  125. cognee/modules/search/methods/no_access_control_search.py +4 -0
  126. cognee/modules/search/methods/search.py +26 -3
  127. cognee/modules/search/types/SearchType.py +1 -1
  128. cognee/modules/settings/get_settings.py +19 -0
  129. cognee/modules/users/methods/create_user.py +12 -27
  130. cognee/modules/users/methods/get_authenticated_user.py +3 -2
  131. cognee/modules/users/methods/get_default_user.py +4 -2
  132. cognee/modules/users/methods/get_user.py +1 -1
  133. cognee/modules/users/methods/get_user_by_email.py +1 -1
  134. cognee/modules/users/models/DatasetDatabase.py +24 -3
  135. cognee/modules/users/models/Tenant.py +6 -7
  136. cognee/modules/users/models/User.py +6 -5
  137. cognee/modules/users/models/UserTenant.py +12 -0
  138. cognee/modules/users/models/__init__.py +1 -0
  139. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
  140. cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
  141. cognee/modules/users/tenants/methods/__init__.py +1 -0
  142. cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
  143. cognee/modules/users/tenants/methods/create_tenant.py +22 -8
  144. cognee/modules/users/tenants/methods/select_tenant.py +62 -0
  145. cognee/shared/logging_utils.py +6 -0
  146. cognee/shared/rate_limiting.py +30 -0
  147. cognee/tasks/chunks/__init__.py +1 -0
  148. cognee/tasks/chunks/chunk_by_row.py +94 -0
  149. cognee/tasks/documents/__init__.py +0 -1
  150. cognee/tasks/documents/classify_documents.py +2 -0
  151. cognee/tasks/feedback/generate_improved_answers.py +3 -3
  152. cognee/tasks/graph/extract_graph_from_data.py +9 -10
  153. cognee/tasks/ingestion/ingest_data.py +1 -1
  154. cognee/tasks/memify/__init__.py +2 -0
  155. cognee/tasks/memify/cognify_session.py +41 -0
  156. cognee/tasks/memify/extract_user_sessions.py +73 -0
  157. cognee/tasks/memify/get_triplet_datapoints.py +289 -0
  158. cognee/tasks/storage/add_data_points.py +142 -2
  159. cognee/tasks/storage/index_data_points.py +33 -22
  160. cognee/tasks/storage/index_graph_edges.py +37 -57
  161. cognee/tests/integration/documents/CsvDocument_test.py +70 -0
  162. cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
  163. cognee/tests/integration/tasks/test_add_data_points.py +139 -0
  164. cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
  165. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
  166. cognee/tests/test_add_docling_document.py +2 -2
  167. cognee/tests/test_cognee_server_start.py +84 -3
  168. cognee/tests/test_conversation_history.py +68 -5
  169. cognee/tests/test_data/example_with_header.csv +3 -0
  170. cognee/tests/test_dataset_database_handler.py +137 -0
  171. cognee/tests/test_dataset_delete.py +76 -0
  172. cognee/tests/test_edge_centered_payload.py +170 -0
  173. cognee/tests/test_edge_ingestion.py +27 -0
  174. cognee/tests/test_feedback_enrichment.py +1 -1
  175. cognee/tests/test_library.py +6 -4
  176. cognee/tests/test_load.py +62 -0
  177. cognee/tests/test_multi_tenancy.py +165 -0
  178. cognee/tests/test_parallel_databases.py +2 -0
  179. cognee/tests/test_pipeline_cache.py +164 -0
  180. cognee/tests/test_relational_db_migration.py +54 -2
  181. cognee/tests/test_search_db.py +44 -2
  182. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
  183. cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
  184. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
  185. cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
  186. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
  187. cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
  188. cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
  189. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
  190. cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
  191. cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
  192. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
  193. cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
  194. cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
  195. cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
  196. cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
  197. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
  198. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
  199. cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
  200. cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
  201. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
  202. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
  203. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
  204. cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
  205. cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
  206. cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
  207. {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -7
  208. {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/RECORD +212 -160
  209. {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
  210. cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
  211. cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
  212. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
  213. cognee/modules/retrieval/code_retriever.py +0 -232
  214. cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
  215. cognee/tasks/code/get_local_dependencies_checker.py +0 -20
  216. cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
  217. cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
  218. cognee/tasks/repo_processor/__init__.py +0 -2
  219. cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
  220. cognee/tasks/repo_processor/get_non_code_files.py +0 -158
  221. cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
  222. {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/WHEEL +0 -0
  223. {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
  224. {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,10 @@
+ from .supported_dataset_database_handlers import supported_dataset_database_handlers
+
+
+ def use_dataset_database_handler(
+     dataset_database_handler_name, dataset_database_handler, dataset_database_provider
+ ):
+     supported_dataset_database_handlers[dataset_database_handler_name] = {
+         "handler_instance": dataset_database_handler,
+         "handler_provider": dataset_database_provider,
+     }
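Together with supported_dataset_database_handlers, this is a simple plugin registry: handlers are keyed by name and stored next to their provider string. A minimal registration sketch, assuming a hypothetical MyDatasetDatabaseHandler and that the package __init__ re-exports both names (import from the use_dataset_database_handler module otherwise):

    from cognee.infrastructure.databases.dataset_database_handler import (
        DatasetDatabaseHandlerInterface,
        use_dataset_database_handler,
    )


    class MyDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):  # hypothetical
        @classmethod
        async def create_dataset(cls, dataset_id, user) -> dict:
            # Return the connection info that will be mapped to the dataset.
            return {"graph_database_name": f"{dataset_id}"}

        @classmethod
        async def delete_dataset(cls, dataset_database):
            pass


    # Register under a name so datasets can resolve the handler later.
    use_dataset_database_handler("my_handler", MyDatasetDatabaseHandler, "my_provider")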
@@ -148,3 +148,19 @@ class CacheConnectionError(CogneeConfigurationError):
          status_code: int = status.HTTP_503_SERVICE_UNAVAILABLE,
      ):
          super().__init__(message, name, status_code)
+
+
+ class SharedKuzuLockRequiresRedisError(CogneeConfigurationError):
+     """
+     Raised when shared Kuzu locking is requested without configuring the Redis backend.
+     """
+
+     def __init__(
+         self,
+         message: str = (
+             "Shared Kuzu lock requires Redis cache backend. Configure Redis to enable shared Kuzu locking."
+         ),
+         name: str = "SharedKuzuLockRequiresRedisError",
+         status_code: int = status.HTTP_400_BAD_REQUEST,
+     ):
+         super().__init__(message, name, status_code)
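The exception implies a configuration guard somewhere upstream. A hedged sketch of what raising it would look like; the cache-provider check here is an assumption, not code from this diff:

    from cognee.infrastructure.databases.exceptions.exceptions import (
        SharedKuzuLockRequiresRedisError,
    )


    def ensure_shared_kuzu_lock_backend(cache_provider: str) -> None:
        # Hypothetical guard: shared Kuzu locking needs a distributed lock,
        # which this release ties to the Redis cache backend.
        if cache_provider != "redis":
            raise SharedKuzuLockRequiresRedisError()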
@@ -26,6 +26,7 @@ class GraphConfig(BaseSettings):
      - graph_database_username
      - graph_database_password
      - graph_database_port
+     - graph_database_key
      - graph_file_path
      - graph_model
      - graph_topology
@@ -41,10 +42,12 @@ class GraphConfig(BaseSettings):
      graph_database_username: str = ""
      graph_database_password: str = ""
      graph_database_port: int = 123
+     graph_database_key: str = ""
      graph_file_path: str = ""
      graph_filename: str = ""
      graph_model: object = KnowledgeGraph
      graph_topology: object = KnowledgeGraph
+     graph_dataset_database_handler: str = "kuzu"
      model_config = SettingsConfigDict(env_file=".env", extra="allow", populate_by_name=True)

      # Model validator updates graph_filename and path dynamically after class creation based on current database provider
@@ -90,10 +93,12 @@ class GraphConfig(BaseSettings):
          "graph_database_username": self.graph_database_username,
          "graph_database_password": self.graph_database_password,
          "graph_database_port": self.graph_database_port,
+         "graph_database_key": self.graph_database_key,
          "graph_file_path": self.graph_file_path,
          "graph_model": self.graph_model,
          "graph_topology": self.graph_topology,
          "model_config": self.model_config,
+         "graph_dataset_database_handler": self.graph_dataset_database_handler,
      }

      def to_hashable_dict(self) -> dict:
@@ -116,7 +121,9 @@ class GraphConfig(BaseSettings):
          "graph_database_username": self.graph_database_username,
          "graph_database_password": self.graph_database_password,
          "graph_database_port": self.graph_database_port,
+         "graph_database_key": self.graph_database_key,
          "graph_file_path": self.graph_file_path,
+         "graph_dataset_database_handler": self.graph_dataset_database_handler,
      }

@@ -33,6 +33,8 @@ def create_graph_engine(
      graph_database_username="",
      graph_database_password="",
      graph_database_port="",
+     graph_database_key="",
+     graph_dataset_database_handler="",
  ):
      """
      Create a graph engine based on the specified provider type.
@@ -69,6 +71,7 @@ def create_graph_engine(
          graph_database_url=graph_database_url,
          graph_database_username=graph_database_username,
          graph_database_password=graph_database_password,
+         database_name=graph_database_name,
      )

      if graph_database_provider == "neo4j":
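The two new GraphConfig fields above flow into create_graph_engine. Since GraphConfig is a pydantic BaseSettings backed by an .env file, they should also be settable from the environment; a hedged sketch, assuming the usual pydantic-settings mapping of field names to upper-case variable names (verify against your settings, and set variables before the config is first loaded, as it may be cached):

    import os

    # Assumed env-var names derived from the field names.
    os.environ["GRAPH_DATABASE_KEY"] = "my-database-key"
    os.environ["GRAPH_DATASET_DATABASE_HANDLER"] = "kuzu"  # the default in this release

    from cognee.infrastructure.databases.graph.config import get_graph_config

    config = get_graph_config()
    print(config.to_hashable_dict()["graph_dataset_database_handler"])  # -> "kuzu"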
@@ -398,3 +398,18 @@ class GraphDBInterface(ABC):
          - node_id (Union[str, UUID]): Unique identifier of the node for which to retrieve connections.
          """
          raise NotImplementedError
+
+     @abstractmethod
+     async def get_filtered_graph_data(
+         self, attribute_filters: List[Dict[str, List[Union[str, int]]]]
+     ) -> Tuple[List[Node], List[EdgeData]]:
+         """
+         Retrieve nodes and edges filtered by the provided attribute criteria.
+
+         Parameters:
+         -----------
+
+         - attribute_filters: A list of dictionaries where keys are attribute names and values
+           are lists of attribute values to filter by.
+         """
+         raise NotImplementedError
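The filter shape is easiest to see in a concrete call. A short sketch against any adapter implementing this interface (adapter construction omitted):

    async def fetch_filtered(adapter):
        # Each dict maps an attribute name to the values it may take, e.g.
        # keep nodes whose "type" is "Person" or "Organization".
        attribute_filters = [{"type": ["Person", "Organization"]}]
        nodes, edges = await adapter.get_filtered_graph_data(attribute_filters)
        return nodes, edges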
@@ -0,0 +1,81 @@
+ import os
+ from uuid import UUID
+ from typing import Optional
+
+ from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_engine
+ from cognee.base_config import get_base_config
+ from cognee.modules.users.models import User
+ from cognee.modules.users.models import DatasetDatabase
+ from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface
+
+
+ class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
+     """
+     Handler for interacting with Kuzu dataset databases.
+     """
+
+     @classmethod
+     async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict:
+         """
+         Create a new Kuzu instance for the dataset. Return connection info that will be mapped to the dataset.
+
+         Args:
+             dataset_id: Dataset UUID
+             user: User object who owns the dataset and is making the request
+
+         Returns:
+             dict: Connection details for the created Kuzu instance
+         """
+         from cognee.infrastructure.databases.graph.config import get_graph_config
+
+         graph_config = get_graph_config()
+
+         if graph_config.graph_database_provider != "kuzu":
+             raise ValueError(
+                 "KuzuDatasetDatabaseHandler can only be used with Kuzu graph database provider."
+             )
+
+         graph_db_name = f"{dataset_id}.pkl"
+         graph_db_url = graph_config.graph_database_url
+         graph_db_key = graph_config.graph_database_key
+         graph_db_username = graph_config.graph_database_username
+         graph_db_password = graph_config.graph_database_password
+
+         return {
+             "graph_database_name": graph_db_name,
+             "graph_database_url": graph_db_url,
+             "graph_database_provider": graph_config.graph_database_provider,
+             "graph_database_key": graph_db_key,
+             "graph_dataset_database_handler": "kuzu",
+             "graph_database_connection_info": {
+                 "graph_database_username": graph_db_username,
+                 "graph_database_password": graph_db_password,
+             },
+         }
+
+     @classmethod
+     async def delete_dataset(cls, dataset_database: DatasetDatabase):
+         base_config = get_base_config()
+         databases_directory_path = os.path.join(
+             base_config.system_root_directory, "databases", str(dataset_database.owner_id)
+         )
+         graph_file_path = os.path.join(
+             databases_directory_path, dataset_database.graph_database_name
+         )
+         graph_engine = create_graph_engine(
+             graph_database_provider=dataset_database.graph_database_provider,
+             graph_database_url=dataset_database.graph_database_url,
+             graph_database_name=dataset_database.graph_database_name,
+             graph_database_key=dataset_database.graph_database_key,
+             graph_file_path=graph_file_path,
+             graph_database_username=dataset_database.graph_database_connection_info.get(
+                 "graph_database_username", ""
+             ),
+             graph_database_password=dataset_database.graph_database_connection_info.get(
+                 "graph_database_password", ""
+             ),
+             graph_dataset_database_handler="",
+             graph_database_port="",
+         )
+         await graph_engine.delete_graph()
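A hedged usage sketch for provisioning per-dataset connection info with this handler (module path per the file list above; the active graph provider must be kuzu, otherwise create_dataset raises ValueError):

    import asyncio
    from uuid import uuid4

    from cognee.infrastructure.databases.graph.kuzu.KuzuDatasetDatabaseHandler import (
        KuzuDatasetDatabaseHandler,
    )


    async def provision():
        connection_info = await KuzuDatasetDatabaseHandler.create_dataset(uuid4(), user=None)
        # One Kuzu database file per dataset, named "<dataset_id>.pkl".
        print(connection_info["graph_database_name"])


    asyncio.run(provision())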
@@ -12,6 +12,7 @@ from contextlib import asynccontextmanager
  from concurrent.futures import ThreadPoolExecutor
  from typing import Dict, Any, List, Union, Optional, Tuple, Type

+ from cognee.exceptions import CogneeValidationError
  from cognee.shared.logging_utils import get_logger
  from cognee.infrastructure.utils.run_sync import run_sync
  from cognee.infrastructure.files.storage import get_file_storage
@@ -1186,6 +1187,11 @@ class KuzuAdapter(GraphDBInterface):
          A tuple with two elements: a list of tuples of (node_id, properties) and a list of
          tuples of (source_id, target_id, relationship_name, properties).
          """
+
+         import time
+
+         start_time = time.time()
+
          try:
              nodes_query = """
                  MATCH (n:Node)
@@ -1249,6 +1255,11 @@ class KuzuAdapter(GraphDBInterface):
                      },
                  )
              )
+
+             retrieval_time = time.time() - start_time
+             logger.info(
+                 f"Retrieved {len(nodes)} nodes and {len(edges)} edges in {retrieval_time:.2f} seconds"
+             )
              return formatted_nodes, formatted_edges
          except Exception as e:
              logger.error(f"Failed to get graph data: {e}")
@@ -1417,6 +1428,92 @@ class KuzuAdapter(GraphDBInterface):
              formatted_edges.append((source_id, target_id, rel_type, props))
          return formatted_nodes, formatted_edges

+     async def get_id_filtered_graph_data(self, target_ids: list[str]):
+         """
+         Retrieve graph data filtered by specific node IDs, including their direct neighbors
+         and only edges where one endpoint matches those IDs.
+
+         Returns:
+             nodes: List[dict] -> Each dict includes "id" and all node properties
+             edges: List[dict] -> Each dict includes "source", "target", "type", "properties"
+         """
+         import time
+
+         start_time = time.time()
+
+         try:
+             if not target_ids:
+                 logger.warning("No target IDs provided for ID-filtered graph retrieval.")
+                 return [], []
+
+             if not all(isinstance(x, str) for x in target_ids):
+                 raise CogneeValidationError("target_ids must be a list of strings")
+
+             query = """
+                 MATCH (n:Node)-[r]->(m:Node)
+                 WHERE n.id IN $target_ids OR m.id IN $target_ids
+                 RETURN n.id, {
+                     name: n.name,
+                     type: n.type,
+                     properties: n.properties
+                 }, m.id, {
+                     name: m.name,
+                     type: m.type,
+                     properties: m.properties
+                 }, r.relationship_name, r.properties
+             """
+
+             result = await self.query(query, {"target_ids": target_ids})
+
+             if not result:
+                 logger.info("No data returned for the supplied IDs")
+                 return [], []
+
+             nodes_dict = {}
+             edges = []
+
+             for n_id, n_props, m_id, m_props, r_type, r_props_raw in result:
+                 if n_props.get("properties"):
+                     try:
+                         additional_props = json.loads(n_props["properties"])
+                         n_props.update(additional_props)
+                         del n_props["properties"]
+                     except json.JSONDecodeError:
+                         logger.warning(f"Failed to parse properties JSON for node {n_id}")
+
+                 if m_props.get("properties"):
+                     try:
+                         additional_props = json.loads(m_props["properties"])
+                         m_props.update(additional_props)
+                         del m_props["properties"]
+                     except json.JSONDecodeError:
+                         logger.warning(f"Failed to parse properties JSON for node {m_id}")
+
+                 nodes_dict[n_id] = (n_id, n_props)
+                 nodes_dict[m_id] = (m_id, m_props)
+
+                 edge_props = {}
+                 if r_props_raw:
+                     try:
+                         edge_props = json.loads(r_props_raw)
+                     except (json.JSONDecodeError, TypeError):
+                         logger.warning(f"Failed to parse edge properties for {n_id}->{m_id}")
+
+                 source_id = edge_props.get("source_node_id", n_id)
+                 target_id = edge_props.get("target_node_id", m_id)
+                 edges.append((source_id, target_id, r_type, edge_props))
+
+             retrieval_time = time.time() - start_time
+             logger.info(
+                 f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s"
+             )
+
+             return list(nodes_dict.values()), edges
+
+         except Exception as e:
+             logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}")
+             raise
+
      async def get_graph_metrics(self, include_optional=False) -> Dict[str, Any]:
          """
          Get metrics on graph structure and connectivity.
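A short usage sketch for the new ID-filtered retrieval (adapter construction omitted); non-string IDs raise CogneeValidationError, and unknown IDs simply return empty results:

    async def neighborhood(adapter, node_ids: list[str]):
        # Nodes are (id, properties) pairs covering the requested IDs plus
        # their direct neighbors; edges keep only those touching a requested ID.
        nodes, edges = await adapter.get_id_filtered_graph_data(node_ids)
        return nodes, edges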
@@ -1908,3 +2005,134 @@
          time_ids_list = [item[0] for item in time_nodes]

          return ", ".join(f"'{uid}'" for uid in time_ids_list)
+
+     async def get_triplets_batch(self, offset: int, limit: int) -> list[dict[str, Any]]:
+         """
+         Retrieve a batch of triplets (start_node, relationship, end_node) from the graph.
+
+         Parameters:
+         -----------
+         - offset (int): Number of triplets to skip before returning results.
+         - limit (int): Maximum number of triplets to return.
+
+         Returns:
+         --------
+         - list[dict[str, Any]]: A list of triplets, where each triplet is a dictionary
+           with keys: 'start_node', 'relationship_properties', 'end_node'.
+
+         Raises:
+         -------
+         - ValueError: If offset or limit are negative.
+         - Exception: Re-raises any exceptions from query execution.
+         """
+         if offset < 0:
+             raise ValueError(f"Offset must be non-negative, got {offset}")
+         if limit < 0:
+             raise ValueError(f"Limit must be non-negative, got {limit}")
+
+         query = """
+             MATCH (start_node:Node)-[relationship:EDGE]->(end_node:Node)
+             RETURN {
+                 start_node: {
+                     id: start_node.id,
+                     name: start_node.name,
+                     type: start_node.type,
+                     properties: start_node.properties
+                 },
+                 relationship_properties: {
+                     relationship_name: relationship.relationship_name,
+                     properties: relationship.properties
+                 },
+                 end_node: {
+                     id: end_node.id,
+                     name: end_node.name,
+                     type: end_node.type,
+                     properties: end_node.properties
+                 }
+             } AS triplet
+             SKIP $offset LIMIT $limit
+         """
+
+         try:
+             results = await self.query(query, {"offset": offset, "limit": limit})
+         except Exception as e:
+             logger.error(f"Failed to execute triplet query: {str(e)}")
+             logger.error(f"Query: {query}")
+             logger.error(f"Parameters: offset={offset}, limit={limit}")
+             raise
+
+         triplets = []
+         for idx, row in enumerate(results):
+             try:
+                 if not row or len(row) == 0:
+                     logger.warning(f"Skipping empty row at index {idx} in triplet batch")
+                     continue
+
+                 if not isinstance(row[0], dict):
+                     logger.warning(
+                         f"Skipping invalid row at index {idx}: expected dict, got {type(row[0])}"
+                     )
+                     continue
+
+                 triplet = row[0]
+
+                 if "start_node" not in triplet:
+                     logger.warning(f"Skipping triplet at index {idx}: missing 'start_node' key")
+                     continue
+
+                 if not isinstance(triplet["start_node"], dict):
+                     logger.warning(f"Skipping triplet at index {idx}: 'start_node' is not a dict")
+                     continue
+
+                 triplet["start_node"] = self._parse_node_properties(triplet["start_node"].copy())
+
+                 if "relationship_properties" not in triplet:
+                     logger.warning(
+                         f"Skipping triplet at index {idx}: missing 'relationship_properties' key"
+                     )
+                     continue
+
+                 if not isinstance(triplet["relationship_properties"], dict):
+                     logger.warning(
+                         f"Skipping triplet at index {idx}: 'relationship_properties' is not a dict"
+                     )
+                     continue
+
+                 rel_props = triplet["relationship_properties"].copy()
+                 relationship_name = rel_props.get("relationship_name") or ""
+
+                 if rel_props.get("properties"):
+                     try:
+                         parsed_props = json.loads(rel_props["properties"])
+                         if isinstance(parsed_props, dict):
+                             rel_props.update(parsed_props)
+                             del rel_props["properties"]
+                         else:
+                             logger.warning(
+                                 f"Parsed relationship properties is not a dict for triplet at index {idx}"
+                             )
+                     except (json.JSONDecodeError, TypeError) as e:
+                         logger.warning(
+                             f"Failed to parse relationship properties JSON for triplet at index {idx}: {e}"
+                         )
+
+                 rel_props["relationship_name"] = relationship_name
+                 triplet["relationship_properties"] = rel_props
+
+                 if "end_node" not in triplet:
+                     logger.warning(f"Skipping triplet at index {idx}: missing 'end_node' key")
+                     continue
+
+                 if not isinstance(triplet["end_node"], dict):
+                     logger.warning(f"Skipping triplet at index {idx}: 'end_node' is not a dict")
+                     continue
+
+                 triplet["end_node"] = self._parse_node_properties(triplet["end_node"].copy())
+
+                 triplets.append(triplet)
+
+             except Exception as e:
+                 logger.error(f"Error processing triplet at index {idx}: {e}", exc_info=True)
+                 continue
+
+         return triplets
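A hedged sketch of paging through the whole graph with the new offset/limit API. Because the adapter skips malformed rows after fetching, a batch can come back shorter than the limit, so the offset is advanced by the requested batch size rather than by the number of triplets returned; stopping on an empty batch is a simplification that could end early if an entire page of rows were malformed:

    async def iter_all_triplets(adapter, batch_size: int = 500):
        offset = 0
        while True:
            batch = await adapter.get_triplets_batch(offset=offset, limit=batch_size)
            if not batch:
                break
            for triplet in batch:
                yield triplet
            offset += batch_size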
@@ -0,0 +1,168 @@
+ import os
+ import asyncio
+ import requests
+ import base64
+ import hashlib
+ from uuid import UUID
+ from typing import Optional
+ from cryptography.fernet import Fernet
+
+ from cognee.infrastructure.databases.graph import get_graph_config
+ from cognee.modules.users.models import User, DatasetDatabase
+ from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface
+
+
+ class Neo4jAuraDevDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
+     """
+     Handler for a quick development PoC integrating Cognee's multi-user and permission mode with Neo4j Aura databases.
+     This handler creates a new Neo4j Aura instance for each Cognee dataset created.
+
+     Improvements needed to be production ready:
+     - Secret management for client credentials. Currently secrets are encrypted and stored in the Cognee relational
+       database; a secret manager or a similar system should be used instead.
+
+     Quality of life improvements:
+     - Allow configuration of different Neo4j Aura plans and regions.
+     - Requests should be made async; currently the blocking requests library is used.
+     """
+
+     @classmethod
+     async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict:
+         """
+         Create a new Neo4j Aura instance for the dataset. Return connection info that will be mapped to the dataset.
+
+         Args:
+             dataset_id: Dataset UUID
+             user: User object who owns the dataset and is making the request
+
+         Returns:
+             dict: Connection details for the created Neo4j instance
+         """
+         graph_config = get_graph_config()
+
+         if graph_config.graph_database_provider != "neo4j":
+             raise ValueError(
+                 "Neo4jAuraDevDatasetDatabaseHandler can only be used with the Neo4j graph database provider."
+             )
+
+         graph_db_name = f"{dataset_id}"
+
+         # Client credentials and encryption
+         client_id = os.environ.get("NEO4J_CLIENT_ID", None)
+         client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None)
+         tenant_id = os.environ.get("NEO4J_TENANT_ID", None)
+         encryption_env_key = os.environ.get("NEO4J_ENCRYPTION_KEY", "test_key")
+         encryption_key = base64.urlsafe_b64encode(
+             hashlib.sha256(encryption_env_key.encode()).digest()
+         )
+         cipher = Fernet(encryption_key)
+
+         if client_id is None or client_secret is None or tenant_id is None:
+             raise ValueError(
+                 "NEO4J_CLIENT_ID, NEO4J_CLIENT_SECRET, and NEO4J_TENANT_ID environment variables must be set to use Neo4j Aura DatasetDatabase handling."
+             )
+
+         # Make the request with HTTP Basic Auth
+         def get_aura_token(client_id: str, client_secret: str) -> dict:
+             url = "https://api.neo4j.io/oauth/token"
+             data = {"grant_type": "client_credentials"}  # sent as application/x-www-form-urlencoded
+
+             resp = requests.post(url, data=data, auth=(client_id, client_secret))
+             resp.raise_for_status()  # raises if the request failed
+             return resp.json()
+
+         resp = get_aura_token(client_id, client_secret)
+
+         url = "https://api.neo4j.io/v1/instances"
+
+         headers = {
+             "accept": "application/json",
+             "Authorization": f"Bearer {resp['access_token']}",
+             "Content-Type": "application/json",
+         }
+
+         # TODO: Maybe we can allow **kwargs parameter forwarding for cases like these
+         # to allow different configurations between datasets
+         payload = {
+             "version": "5",
+             "region": "europe-west1",
+             "memory": "1GB",
+             "name": graph_db_name[
+                 0:29
+             ],  # TODO: Find a better way to name the Neo4j instance within the 30 character limit
+             "type": "professional-db",
+             "tenant_id": tenant_id,
+             "cloud_provider": "gcp",
+         }
+
+         response = requests.post(url, headers=headers, json=payload)
+
+         graph_db_name = "neo4j"  # Has to be 'neo4j' for Aura
+         graph_db_url = response.json()["data"]["connection_url"]
+         graph_db_key = resp["access_token"]
+         graph_db_username = response.json()["data"]["username"]
+         graph_db_password = response.json()["data"]["password"]
+
+         async def _wait_for_neo4j_instance_provisioning(instance_id: str, headers: dict):
+             # Poll until the instance is running
+             status_url = f"https://api.neo4j.io/v1/instances/{instance_id}"
+             status = ""
+             for attempt in range(30):  # Try for up to ~5 minutes
+                 status_resp = requests.get(
+                     status_url, headers=headers
+                 )  # TODO: Use async requests with httpx
+                 status = status_resp.json()["data"]["status"]
+                 if status.lower() == "running":
+                     return
+                 await asyncio.sleep(10)
+             raise TimeoutError(
+                 f"Neo4j instance '{graph_db_name}' did not become ready within 5 minutes. Status: {status}"
+             )
+
+         instance_id = response.json()["data"]["id"]
+         await _wait_for_neo4j_instance_provisioning(instance_id, headers)
+
+         encrypted_db_password_bytes = cipher.encrypt(graph_db_password.encode())
+         encrypted_db_password_string = encrypted_db_password_bytes.decode()
+
+         return {
+             "graph_database_name": graph_db_name,
+             "graph_database_url": graph_db_url,
+             "graph_database_provider": "neo4j",
+             "graph_database_key": graph_db_key,
+             "graph_dataset_database_handler": "neo4j_aura_dev",
+             "graph_database_connection_info": {
+                 "graph_database_username": graph_db_username,
+                 "graph_database_password": encrypted_db_password_string,
+             },
+         }
+
+     @classmethod
+     async def resolve_dataset_connection_info(
+         cls, dataset_database: DatasetDatabase
+     ) -> DatasetDatabase:
+         """
+         Resolve and decrypt connection info for the Neo4j dataset database.
+         In this case, decrypt the password stored in the database.
+
+         Args:
+             dataset_database: DatasetDatabase instance containing encrypted connection info.
+         """
+         encryption_env_key = os.environ.get("NEO4J_ENCRYPTION_KEY", "test_key")
+         encryption_key = base64.urlsafe_b64encode(
+             hashlib.sha256(encryption_env_key.encode()).digest()
+         )
+         cipher = Fernet(encryption_key)
+         graph_db_password = cipher.decrypt(
+             dataset_database.graph_database_connection_info["graph_database_password"].encode()
+         ).decode()
+
+         dataset_database.graph_database_connection_info["graph_database_password"] = (
+             graph_db_password
+         )
+         return dataset_database
+
+     @classmethod
+     async def delete_dataset(cls, dataset_database: DatasetDatabase):
+         pass
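Per the code above, the handler reads four environment variables; a setup sketch with placeholder values:

    import os

    os.environ["NEO4J_CLIENT_ID"] = "..."      # Aura API client id
    os.environ["NEO4J_CLIENT_SECRET"] = "..."  # Aura API client secret
    os.environ["NEO4J_TENANT_ID"] = "..."      # Aura tenant to create instances in
    os.environ["NEO4J_ENCRYPTION_KEY"] = "..." # replaces the insecure "test_key" default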