cognee 0.5.0__py3-none-any.whl → 0.5.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. cognee/api/client.py +5 -1
  2. cognee/api/v1/add/add.py +1 -2
  3. cognee/api/v1/cognify/code_graph_pipeline.py +119 -0
  4. cognee/api/v1/cognify/cognify.py +16 -24
  5. cognee/api/v1/cognify/routers/__init__.py +1 -0
  6. cognee/api/v1/cognify/routers/get_code_pipeline_router.py +90 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +1 -3
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
  9. cognee/api/v1/ontologies/ontologies.py +37 -12
  10. cognee/api/v1/ontologies/routers/get_ontology_router.py +25 -27
  11. cognee/api/v1/search/search.py +0 -4
  12. cognee/api/v1/ui/ui.py +68 -38
  13. cognee/context_global_variables.py +16 -61
  14. cognee/eval_framework/answer_generation/answer_generation_executor.py +0 -10
  15. cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
  16. cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +2 -0
  17. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
  18. cognee/eval_framework/eval_config.py +2 -2
  19. cognee/eval_framework/modal_run_eval.py +28 -16
  20. cognee/infrastructure/databases/graph/config.py +0 -3
  21. cognee/infrastructure/databases/graph/get_graph_engine.py +0 -1
  22. cognee/infrastructure/databases/graph/graph_db_interface.py +0 -15
  23. cognee/infrastructure/databases/graph/kuzu/adapter.py +0 -228
  24. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +1 -80
  25. cognee/infrastructure/databases/utils/__init__.py +0 -3
  26. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +48 -62
  27. cognee/infrastructure/databases/vector/config.py +0 -2
  28. cognee/infrastructure/databases/vector/create_vector_engine.py +0 -1
  29. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +6 -8
  30. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +7 -9
  31. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +10 -11
  32. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +544 -0
  33. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -2
  34. cognee/infrastructure/databases/vector/vector_db_interface.py +0 -35
  35. cognee/infrastructure/files/storage/s3_config.py +0 -2
  36. cognee/infrastructure/llm/LLMGateway.py +2 -5
  37. cognee/infrastructure/llm/config.py +0 -35
  38. cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
  39. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +8 -23
  40. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +16 -17
  41. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +37 -40
  42. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +36 -39
  43. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +1 -19
  44. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +9 -11
  45. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +21 -23
  46. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +34 -42
  47. cognee/modules/cognify/config.py +0 -2
  48. cognee/modules/data/deletion/prune_system.py +2 -52
  49. cognee/modules/data/methods/delete_dataset.py +0 -26
  50. cognee/modules/engine/models/__init__.py +0 -1
  51. cognee/modules/graph/cognee_graph/CogneeGraph.py +37 -85
  52. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +3 -8
  53. cognee/modules/memify/memify.py +7 -1
  54. cognee/modules/pipelines/operations/pipeline.py +2 -18
  55. cognee/modules/retrieval/__init__.py +1 -1
  56. cognee/modules/retrieval/code_retriever.py +232 -0
  57. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -4
  58. cognee/modules/retrieval/graph_completion_cot_retriever.py +0 -4
  59. cognee/modules/retrieval/graph_completion_retriever.py +0 -10
  60. cognee/modules/retrieval/graph_summary_completion_retriever.py +0 -4
  61. cognee/modules/retrieval/temporal_retriever.py +0 -4
  62. cognee/modules/retrieval/utils/brute_force_triplet_search.py +10 -42
  63. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +1 -8
  64. cognee/modules/search/methods/get_search_type_tools.py +8 -54
  65. cognee/modules/search/methods/no_access_control_search.py +0 -4
  66. cognee/modules/search/methods/search.py +0 -21
  67. cognee/modules/search/types/SearchType.py +1 -1
  68. cognee/modules/settings/get_settings.py +0 -19
  69. cognee/modules/users/methods/get_authenticated_user.py +2 -2
  70. cognee/modules/users/models/DatasetDatabase.py +3 -15
  71. cognee/shared/logging_utils.py +0 -4
  72. cognee/tasks/code/enrich_dependency_graph_checker.py +35 -0
  73. cognee/tasks/code/get_local_dependencies_checker.py +20 -0
  74. cognee/tasks/code/get_repo_dependency_graph_checker.py +35 -0
  75. cognee/tasks/documents/__init__.py +1 -0
  76. cognee/tasks/documents/check_permissions_on_dataset.py +26 -0
  77. cognee/tasks/graph/extract_graph_from_data.py +10 -9
  78. cognee/tasks/repo_processor/__init__.py +2 -0
  79. cognee/tasks/repo_processor/get_local_dependencies.py +335 -0
  80. cognee/tasks/repo_processor/get_non_code_files.py +158 -0
  81. cognee/tasks/repo_processor/get_repo_file_dependencies.py +243 -0
  82. cognee/tasks/storage/add_data_points.py +2 -142
  83. cognee/tests/test_cognee_server_start.py +4 -2
  84. cognee/tests/test_conversation_history.py +1 -23
  85. cognee/tests/test_delete_bmw_example.py +60 -0
  86. cognee/tests/test_search_db.py +1 -37
  87. cognee/tests/unit/api/test_ontology_endpoint.py +89 -77
  88. cognee/tests/unit/infrastructure/mock_embedding_engine.py +7 -3
  89. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -0
  90. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
  91. cognee/tests/unit/modules/graph/cognee_graph_test.py +0 -406
  92. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/METADATA +89 -76
  93. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/RECORD +97 -118
  94. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/WHEEL +1 -1
  95. cognee/api/v1/ui/node_setup.py +0 -360
  96. cognee/api/v1/ui/npm_utils.py +0 -50
  97. cognee/eval_framework/Dockerfile +0 -29
  98. cognee/infrastructure/databases/dataset_database_handler/__init__.py +0 -3
  99. cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +0 -80
  100. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +0 -18
  101. cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +0 -10
  102. cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +0 -81
  103. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +0 -168
  104. cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +0 -10
  105. cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +0 -10
  106. cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +0 -30
  107. cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +0 -50
  108. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +0 -5
  109. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +0 -153
  110. cognee/memify_pipelines/create_triplet_embeddings.py +0 -53
  111. cognee/modules/engine/models/Triplet.py +0 -9
  112. cognee/modules/retrieval/register_retriever.py +0 -10
  113. cognee/modules/retrieval/registered_community_retrievers.py +0 -1
  114. cognee/modules/retrieval/triplet_retriever.py +0 -182
  115. cognee/shared/rate_limiting.py +0 -30
  116. cognee/tasks/memify/get_triplet_datapoints.py +0 -289
  117. cognee/tests/integration/retrieval/test_triplet_retriever.py +0 -84
  118. cognee/tests/integration/tasks/test_add_data_points.py +0 -139
  119. cognee/tests/integration/tasks/test_get_triplet_datapoints.py +0 -69
  120. cognee/tests/test_dataset_database_handler.py +0 -137
  121. cognee/tests/test_dataset_delete.py +0 -76
  122. cognee/tests/test_edge_centered_payload.py +0 -170
  123. cognee/tests/test_pipeline_cache.py +0 -164
  124. cognee/tests/unit/infrastructure/llm/test_llm_config.py +0 -46
  125. cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +0 -214
  126. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +0 -608
  127. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +0 -83
  128. cognee/tests/unit/tasks/storage/test_add_data_points.py +0 -288
  129. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/entry_points.txt +0 -0
  130. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/LICENSE +0 -0
  131. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/databases/graph/kuzu/adapter.py
@@ -12,7 +12,6 @@ from contextlib import asynccontextmanager
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Any, List, Union, Optional, Tuple, Type
 
-from cognee.exceptions import CogneeValidationError
 from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.utils.run_sync import run_sync
 from cognee.infrastructure.files.storage import get_file_storage
@@ -1187,11 +1186,6 @@ class KuzuAdapter(GraphDBInterface):
         A tuple with two elements: a list of tuples of (node_id, properties) and a list of
         tuples of (source_id, target_id, relationship_name, properties).
         """
-
-        import time
-
-        start_time = time.time()
-
         try:
             nodes_query = """
             MATCH (n:Node)
@@ -1255,11 +1249,6 @@ class KuzuAdapter(GraphDBInterface):
                         },
                     )
                 )
-
-            retrieval_time = time.time() - start_time
-            logger.info(
-                f"Retrieved {len(nodes)} nodes and {len(edges)} edges in {retrieval_time:.2f} seconds"
-            )
             return formatted_nodes, formatted_edges
         except Exception as e:
             logger.error(f"Failed to get graph data: {e}")
@@ -1428,92 +1417,6 @@ class KuzuAdapter(GraphDBInterface):
             formatted_edges.append((source_id, target_id, rel_type, props))
         return formatted_nodes, formatted_edges
 
-    async def get_id_filtered_graph_data(self, target_ids: list[str]):
-        """
-        Retrieve graph data filtered by specific node IDs, including their direct neighbors
-        and only edges where one endpoint matches those IDs.
-
-        Returns:
-            nodes: List[dict] -> Each dict includes "id" and all node properties
-            edges: List[dict] -> Each dict includes "source", "target", "type", "properties"
-        """
-        import time
-
-        start_time = time.time()
-
-        try:
-            if not target_ids:
-                logger.warning("No target IDs provided for ID-filtered graph retrieval.")
-                return [], []
-
-            if not all(isinstance(x, str) for x in target_ids):
-                raise CogneeValidationError("target_ids must be a list of strings")
-
-            query = """
-            MATCH (n:Node)-[r]->(m:Node)
-            WHERE n.id IN $target_ids OR m.id IN $target_ids
-            RETURN n.id, {
-                name: n.name,
-                type: n.type,
-                properties: n.properties
-            }, m.id, {
-                name: m.name,
-                type: m.type,
-                properties: m.properties
-            }, r.relationship_name, r.properties
-            """
-
-            result = await self.query(query, {"target_ids": target_ids})
-
-            if not result:
-                logger.info("No data returned for the supplied IDs")
-                return [], []
-
-            nodes_dict = {}
-            edges = []
-
-            for n_id, n_props, m_id, m_props, r_type, r_props_raw in result:
-                if n_props.get("properties"):
-                    try:
-                        additional_props = json.loads(n_props["properties"])
-                        n_props.update(additional_props)
-                        del n_props["properties"]
-                    except json.JSONDecodeError:
-                        logger.warning(f"Failed to parse properties JSON for node {n_id}")
-
-                if m_props.get("properties"):
-                    try:
-                        additional_props = json.loads(m_props["properties"])
-                        m_props.update(additional_props)
-                        del m_props["properties"]
-                    except json.JSONDecodeError:
-                        logger.warning(f"Failed to parse properties JSON for node {m_id}")
-
-                nodes_dict[n_id] = (n_id, n_props)
-                nodes_dict[m_id] = (m_id, m_props)
-
-                edge_props = {}
-                if r_props_raw:
-                    try:
-                        edge_props = json.loads(r_props_raw)
-                    except (json.JSONDecodeError, TypeError):
-                        logger.warning(f"Failed to parse edge properties for {n_id}->{m_id}")
-
-                source_id = edge_props.get("source_node_id", n_id)
-                target_id = edge_props.get("target_node_id", m_id)
-                edges.append((source_id, target_id, r_type, edge_props))
-
-            retrieval_time = time.time() - start_time
-            logger.info(
-                f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s"
-            )
-
-            return list(nodes_dict.values()), edges
-
-        except Exception as e:
-            logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}")
-            raise
-
     async def get_graph_metrics(self, include_optional=False) -> Dict[str, Any]:
         """
         Get metrics on graph structure and connectivity.
@@ -2005,134 +1908,3 @@ class KuzuAdapter(GraphDBInterface):
         time_ids_list = [item[0] for item in time_nodes]
 
         return ", ".join(f"'{uid}'" for uid in time_ids_list)
-
-    async def get_triplets_batch(self, offset: int, limit: int) -> list[dict[str, Any]]:
-        """
-        Retrieve a batch of triplets (start_node, relationship, end_node) from the graph.
-
-        Parameters:
-        -----------
-        - offset (int): Number of triplets to skip before returning results.
-        - limit (int): Maximum number of triplets to return.
-
-        Returns:
-        --------
-        - list[dict[str, Any]]: A list of triplets, where each triplet is a dictionary
-          with keys: 'start_node', 'relationship_properties', 'end_node'.
-
-        Raises:
-        -------
-        - ValueError: If offset or limit are negative.
-        - Exception: Re-raises any exceptions from query execution.
-        """
-        if offset < 0:
-            raise ValueError(f"Offset must be non-negative, got {offset}")
-        if limit < 0:
-            raise ValueError(f"Limit must be non-negative, got {limit}")
-
-        query = """
-        MATCH (start_node:Node)-[relationship:EDGE]->(end_node:Node)
-        RETURN {
-            start_node: {
-                id: start_node.id,
-                name: start_node.name,
-                type: start_node.type,
-                properties: start_node.properties
-            },
-            relationship_properties: {
-                relationship_name: relationship.relationship_name,
-                properties: relationship.properties
-            },
-            end_node: {
-                id: end_node.id,
-                name: end_node.name,
-                type: end_node.type,
-                properties: end_node.properties
-            }
-        } AS triplet
-        SKIP $offset LIMIT $limit
-        """
-
-        try:
-            results = await self.query(query, {"offset": offset, "limit": limit})
-        except Exception as e:
-            logger.error(f"Failed to execute triplet query: {str(e)}")
-            logger.error(f"Query: {query}")
-            logger.error(f"Parameters: offset={offset}, limit={limit}")
-            raise
-
-        triplets = []
-        for idx, row in enumerate(results):
-            try:
-                if not row or len(row) == 0:
-                    logger.warning(f"Skipping empty row at index {idx} in triplet batch")
-                    continue
-
-                if not isinstance(row[0], dict):
-                    logger.warning(
-                        f"Skipping invalid row at index {idx}: expected dict, got {type(row[0])}"
-                    )
-                    continue
-
-                triplet = row[0]
-
-                if "start_node" not in triplet:
-                    logger.warning(f"Skipping triplet at index {idx}: missing 'start_node' key")
-                    continue
-
-                if not isinstance(triplet["start_node"], dict):
-                    logger.warning(f"Skipping triplet at index {idx}: 'start_node' is not a dict")
-                    continue
-
-                triplet["start_node"] = self._parse_node_properties(triplet["start_node"].copy())
-
-                if "relationship_properties" not in triplet:
-                    logger.warning(
-                        f"Skipping triplet at index {idx}: missing 'relationship_properties' key"
-                    )
-                    continue
-
-                if not isinstance(triplet["relationship_properties"], dict):
-                    logger.warning(
-                        f"Skipping triplet at index {idx}: 'relationship_properties' is not a dict"
-                    )
-                    continue
-
-                rel_props = triplet["relationship_properties"].copy()
-                relationship_name = rel_props.get("relationship_name") or ""
-
-                if rel_props.get("properties"):
-                    try:
-                        parsed_props = json.loads(rel_props["properties"])
-                        if isinstance(parsed_props, dict):
-                            rel_props.update(parsed_props)
-                            del rel_props["properties"]
-                        else:
-                            logger.warning(
-                                f"Parsed relationship properties is not a dict for triplet at index {idx}"
-                            )
-                    except (json.JSONDecodeError, TypeError) as e:
-                        logger.warning(
-                            f"Failed to parse relationship properties JSON for triplet at index {idx}: {e}"
-                        )
-
-                rel_props["relationship_name"] = relationship_name
-                triplet["relationship_properties"] = rel_props
-
-                if "end_node" not in triplet:
-                    logger.warning(f"Skipping triplet at index {idx}: missing 'end_node' key")
-                    continue
-
-                if not isinstance(triplet["end_node"], dict):
-                    logger.warning(f"Skipping triplet at index {idx}: 'end_node' is not a dict")
-                    continue
-
-                triplet["end_node"] = self._parse_node_properties(triplet["end_node"].copy())
-
-                triplets.append(triplet)
-
-            except Exception as e:
-                logger.error(f"Error processing triplet at index {idx}: {e}", exc_info=True)
-                continue
-
-        return triplets
cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@@ -8,7 +8,7 @@ from neo4j import AsyncSession
 from neo4j import AsyncGraphDatabase
 from neo4j.exceptions import Neo4jError
 from contextlib import asynccontextmanager
-from typing import Optional, Any, List, Dict, Type, Tuple, Coroutine
+from typing import Optional, Any, List, Dict, Type, Tuple
 
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int
@@ -964,63 +964,6 @@ class Neo4jAdapter(GraphDBInterface):
             logger.error(f"Error during graph data retrieval: {str(e)}")
             raise
 
-    async def get_id_filtered_graph_data(self, target_ids: list[str]):
-        """
-        Retrieve graph data filtered by specific node IDs, including their direct neighbors
-        and only edges where one endpoint matches those IDs.
-
-        This version uses a single Cypher query for efficiency.
-        """
-        import time
-
-        start_time = time.time()
-
-        try:
-            if not target_ids:
-                logger.warning("No target IDs provided for ID-filtered graph retrieval.")
-                return [], []
-
-            query = """
-            MATCH ()-[r]-()
-            WHERE startNode(r).id IN $target_ids
-               OR endNode(r).id IN $target_ids
-            WITH DISTINCT r, startNode(r) AS a, endNode(r) AS b
-            RETURN
-                properties(a) AS n_properties,
-                properties(b) AS m_properties,
-                type(r) AS type,
-                properties(r) AS properties
-            """
-
-            result = await self.query(query, {"target_ids": target_ids})
-
-            nodes_dict = {}
-            edges = []
-
-            for record in result:
-                n_props = record["n_properties"]
-                m_props = record["m_properties"]
-                r_props = record["properties"]
-                r_type = record["type"]
-
-                nodes_dict[n_props["id"]] = (n_props["id"], n_props)
-                nodes_dict[m_props["id"]] = (m_props["id"], m_props)
-
-                source_id = r_props.get("source_node_id", n_props["id"])
-                target_id = r_props.get("target_node_id", m_props["id"])
-                edges.append((source_id, target_id, r_type, r_props))
-
-            retrieval_time = time.time() - start_time
-            logger.info(
-                f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s"
-            )
-
-            return list(nodes_dict.values()), edges
-
-        except Exception as e:
-            logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}")
-            raise
-
     async def get_nodeset_subgraph(
         self, node_type: Type[Any], node_name: List[str]
     ) -> Tuple[List[Tuple[int, dict]], List[Tuple[int, int, str, dict]]]:
@@ -1527,25 +1470,3 @@ class Neo4jAdapter(GraphDBInterface):
         time_ids_list = [item["id"] for item in time_nodes if "id" in item]
 
         return ", ".join(f"'{uid}'" for uid in time_ids_list)
-
-    async def get_triplets_batch(self, offset: int, limit: int) -> list[dict[str, Any]]:
-        """
-        Retrieve a batch of triplets (start_node, relationship, end_node) from the graph.
-
-        Parameters:
-        -----------
-        - offset (int): Number of triplets to skip before returning results.
-        - limit (int): Maximum number of triplets to return.
-
-        Returns:
-        --------
-        - list[dict[str, Any]]: A list of triplets.
-        """
-        query = f"""
-        MATCH (start_node:`{BASE_LABEL}`)-[relationship]->(end_node:`{BASE_LABEL}`)
-        RETURN start_node, properties(relationship) AS relationship_properties, end_node
-        SKIP $offset LIMIT $limit
-        """
-        results = await self.query(query, {"offset": offset, "limit": limit})
-
-        return results
cognee/infrastructure/databases/utils/__init__.py
@@ -1,4 +1 @@
 from .get_or_create_dataset_database import get_or_create_dataset_database
-from .resolve_dataset_database_connection_info import resolve_dataset_database_connection_info
-from .get_graph_dataset_database_handler import get_graph_dataset_database_handler
-from .get_vector_dataset_database_handler import get_vector_dataset_database_handler
cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
@@ -1,9 +1,11 @@
+import os
 from uuid import UUID
-from typing import Union, Optional
+from typing import Union
 
 from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError
 
+from cognee.base_config import get_base_config
 from cognee.modules.data.methods import create_dataset
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.infrastructure.databases.vector import get_vectordb_config
@@ -13,53 +15,6 @@ from cognee.modules.users.models import DatasetDatabase
 from cognee.modules.users.models import User
 
 
-async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict:
-    vector_config = get_vectordb_config()
-
-    from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
-        supported_dataset_database_handlers,
-    )
-
-    handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler]
-    return await handler["handler_instance"].create_dataset(dataset_id, user)
-
-
-async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict:
-    graph_config = get_graph_config()
-
-    from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
-        supported_dataset_database_handlers,
-    )
-
-    handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler]
-    return await handler["handler_instance"].create_dataset(dataset_id, user)
-
-
-async def _existing_dataset_database(
-    dataset_id: UUID,
-    user: User,
-) -> Optional[DatasetDatabase]:
-    """
-    Check if a DatasetDatabase row already exists for the given owner + dataset.
-    Return None if it doesn't exist, return the row if it does.
-    Args:
-        dataset_id:
-        user:
-
-    Returns:
-        DatasetDatabase or None
-    """
-    db_engine = get_relational_engine()
-
-    async with db_engine.get_async_session() as session:
-        stmt = select(DatasetDatabase).where(
-            DatasetDatabase.owner_id == user.id,
-            DatasetDatabase.dataset_id == dataset_id,
-        )
-        existing: DatasetDatabase = await session.scalar(stmt)
-        return existing
-
-
 async def get_or_create_dataset_database(
     dataset: Union[str, UUID],
     user: User,
@@ -70,8 +25,6 @@ async def get_or_create_dataset_database(
     • If the row already exists, it is fetched and returned.
     • Otherwise a new one is created atomically and returned.
 
-    DatasetDatabase row contains connection and provider info for vector and graph databases.
-
     Parameters
     ----------
    user : User
@@ -83,26 +36,59 @@ async def get_or_create_dataset_database(
 
     dataset_id = await get_unique_dataset_id(dataset, user)
 
-    # If dataset is given as name make sure the dataset is created first
-    if isinstance(dataset, str):
-        async with db_engine.get_async_session() as session:
-            await create_dataset(dataset, user, session)
+    vector_config = get_vectordb_config()
+    graph_config = get_graph_config()
 
-    # If dataset database already exists return it
-    existing_dataset_database = await _existing_dataset_database(dataset_id, user)
-    if existing_dataset_database:
-        return existing_dataset_database
+    # Note: for hybrid databases both graph and vector DB name have to be the same
+    if graph_config.graph_database_provider == "kuzu":
+        graph_db_name = f"{dataset_id}.pkl"
+    else:
+        graph_db_name = f"{dataset_id}"
 
-    graph_config_dict = await _get_graph_db_info(dataset_id, user)
-    vector_config_dict = await _get_vector_db_info(dataset_id, user)
+    if vector_config.vector_db_provider == "lancedb":
+        vector_db_name = f"{dataset_id}.lance.db"
+    else:
+        vector_db_name = f"{dataset_id}"
+
+    base_config = get_base_config()
+    databases_directory_path = os.path.join(
+        base_config.system_root_directory, "databases", str(user.id)
+    )
+
+    # Determine vector database URL
+    if vector_config.vector_db_provider == "lancedb":
+        vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name)
+    else:
+        vector_db_url = vector_config.vector_database_url
+
+    # Determine graph database URL
 
     async with db_engine.get_async_session() as session:
+        # Create dataset if it doesn't exist
+        if isinstance(dataset, str):
+            dataset = await create_dataset(dataset, user, session)
+
+        # Try to fetch an existing row first
+        stmt = select(DatasetDatabase).where(
+            DatasetDatabase.owner_id == user.id,
+            DatasetDatabase.dataset_id == dataset_id,
+        )
+        existing: DatasetDatabase = await session.scalar(stmt)
+        if existing:
+            return existing
+
         # If there are no existing rows build a new row
         record = DatasetDatabase(
             owner_id=user.id,
             dataset_id=dataset_id,
-            **graph_config_dict,  # Unpack graph db config
-            **vector_config_dict,  # Unpack vector db config
+            vector_database_name=vector_db_name,
+            graph_database_name=graph_db_name,
+            vector_database_provider=vector_config.vector_db_provider,
+            graph_database_provider=graph_config.graph_database_provider,
+            vector_database_url=vector_db_url,
+            graph_database_url=graph_config.graph_database_url,
+            vector_database_key=vector_config.vector_db_key,
+            graph_database_key=graph_config.graph_database_key,
         )
 
         try:
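
The rewritten get_or_create_dataset_database above derives per-dataset database names and URLs directly from the configured providers instead of delegating to the removed dataset-database handlers. A minimal standalone sketch of that naming scheme; the provider values, root directory and IDs below are illustrative examples, not cognee defaults:

    # Sketch of the per-dataset naming introduced above; providers, root
    # directory and IDs are example values, not cognee defaults.
    import os
    from uuid import uuid4

    dataset_id = uuid4()
    user_id = uuid4()
    graph_database_provider = "kuzu"       # e.g. "kuzu" or "neo4j"
    vector_db_provider = "lancedb"         # e.g. "lancedb" or a remote provider
    system_root_directory = "/tmp/cognee"  # illustrative root

    # Note: for hybrid databases both graph and vector DB names have to be the same
    graph_db_name = f"{dataset_id}.pkl" if graph_database_provider == "kuzu" else f"{dataset_id}"
    vector_db_name = f"{dataset_id}.lance.db" if vector_db_provider == "lancedb" else f"{dataset_id}"

    # LanceDB is file-based, so its URL is a local per-user path; remote providers
    # keep the URL from their configuration.
    databases_directory_path = os.path.join(system_root_directory, "databases", str(user_id))
    vector_db_url = (
        os.path.join(databases_directory_path, vector_db_name)
        if vector_db_provider == "lancedb"
        else "http://example-vector-db:6333"  # placeholder remote URL
    )

    print(graph_db_name, vector_db_name, vector_db_url)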
cognee/infrastructure/databases/vector/config.py
@@ -28,7 +28,6 @@ class VectorConfig(BaseSettings):
     vector_db_name: str = ""
     vector_db_key: str = ""
     vector_db_provider: str = "lancedb"
-    vector_dataset_database_handler: str = "lancedb"
 
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
@@ -64,7 +63,6 @@ class VectorConfig(BaseSettings):
             "vector_db_name": self.vector_db_name,
             "vector_db_key": self.vector_db_key,
             "vector_db_provider": self.vector_db_provider,
-            "vector_dataset_database_handler": self.vector_dataset_database_handler,
         }
 
 
cognee/infrastructure/databases/vector/create_vector_engine.py
@@ -12,7 +12,6 @@ def create_vector_engine(
     vector_db_name: str,
     vector_db_port: str = "",
     vector_db_key: str = "",
-    vector_dataset_database_handler: str = "",
 ):
     """
     Create a vector database engine based on the specified provider.
cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py
@@ -17,7 +17,6 @@ from cognee.infrastructure.databases.exceptions import EmbeddingException
 from cognee.infrastructure.llm.tokenizer.TikToken import (
     TikTokenTokenizer,
 )
-from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
 
 litellm.set_verbose = False
 logger = get_logger("FastembedEmbeddingEngine")
@@ -69,7 +68,7 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
 
     @retry(
         stop=stop_after_delay(128),
-        wait=wait_exponential_jitter(8, 128),
+        wait=wait_exponential_jitter(2, 128),
         retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
         before_sleep=before_sleep_log(logger, logging.DEBUG),
         reraise=True,
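
Both the Fastembed and the Ollama engines now start this exponential backoff at 2 seconds instead of 8. A minimal sketch of how a tenacity policy configured this way behaves; the decorated function is a hypothetical stand-in for the real embedding call, only the decorator mirrors the adapter code:

    # Sketch of the retry policy configured above (tenacity).
    import logging

    import litellm
    from tenacity import (
        before_sleep_log,
        retry,
        retry_if_not_exception_type,
        stop_after_delay,
        wait_exponential_jitter,
    )

    logger = logging.getLogger("embedding_retry_sketch")

    @retry(
        stop=stop_after_delay(128),  # stop retrying once ~128s have elapsed overall
        wait=wait_exponential_jitter(2, 128),  # 2s, 4s, 8s, ... capped at 128s, plus jitter
        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
        before_sleep=before_sleep_log(logger, logging.DEBUG),
        reraise=True,
    )
    async def embed_with_retry(texts: list[str]) -> list[list[float]]:
        # Placeholder: transient failures raised here are retried with the backoff
        # above; a NotFoundError (e.g. unknown model) is re-raised immediately.
        raise NotImplementedError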
@@ -97,12 +96,11 @@
         if self.mock:
             return [[0.0] * self.dimensions for _ in text]
         else:
-            async with embedding_rate_limiter_context_manager():
-                embeddings = self.embedding_model.embed(
-                    text,
-                    batch_size=len(text),
-                    parallel=None,
-                )
+            embeddings = self.embedding_model.embed(
+                text,
+                batch_size=len(text),
+                parallel=None,
+            )
 
         return list(embeddings)
 
cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -25,7 +25,6 @@ from cognee.infrastructure.llm.tokenizer.Mistral import (
 from cognee.infrastructure.llm.tokenizer.TikToken import (
     TikTokenTokenizer,
 )
-from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
 
 litellm.set_verbose = False
 logger = get_logger("LiteLLMEmbeddingEngine")
@@ -110,14 +109,13 @@
             response = {"data": [{"embedding": [0.0] * self.dimensions} for _ in text]}
             return [data["embedding"] for data in response["data"]]
         else:
-            async with embedding_rate_limiter_context_manager():
-                response = await litellm.aembedding(
-                    model=self.model,
-                    input=text,
-                    api_key=self.api_key,
-                    api_base=self.endpoint,
-                    api_version=self.api_version,
-                )
+            response = await litellm.aembedding(
+                model=self.model,
+                input=text,
+                api_key=self.api_key,
+                api_base=self.endpoint,
+                api_version=self.api_version,
+            )
 
         return [data["embedding"] for data in response.data]
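
With the shared rate-limiter context manager gone, LiteLLMEmbeddingEngine awaits litellm.aembedding directly, as shown above. A minimal standalone sketch of that call path; the model name and API key are placeholders, and error handling and retries are omitted:

    # Sketch of the direct litellm embedding call used above.
    import asyncio

    import litellm

    async def embed_texts(texts: list[str]) -> list[list[float]]:
        response = await litellm.aembedding(
            model="openai/text-embedding-3-small",  # illustrative model name
            input=texts,
            api_key="sk-placeholder",               # placeholder credential
        )
        # Each entry in response.data carries an "embedding" vector, mirroring
        # the list comprehension in the engine above.
        return [item["embedding"] for item in response.data]

    # Example: asyncio.run(embed_texts(["hello graph", "hello vector"]))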
cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
@@ -18,7 +18,10 @@ from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import Em
 from cognee.infrastructure.llm.tokenizer.HuggingFace import (
     HuggingFaceTokenizer,
 )
-from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
+from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
+    embedding_rate_limit_async,
+    embedding_sleep_and_retry_async,
+)
 from cognee.shared.utils import create_secure_ssl_context
 
 logger = get_logger("OllamaEmbeddingEngine")
@@ -98,7 +101,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
 
     @retry(
         stop=stop_after_delay(128),
-        wait=wait_exponential_jitter(8, 128),
+        wait=wait_exponential_jitter(2, 128),
         retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
         before_sleep=before_sleep_log(logger, logging.DEBUG),
         reraise=True,
@@ -117,15 +120,11 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         ssl_context = create_secure_ssl_context()
         connector = aiohttp.TCPConnector(ssl=ssl_context)
         async with aiohttp.ClientSession(connector=connector) as session:
-            async with embedding_rate_limiter_context_manager():
-                async with session.post(
-                    self.endpoint, json=payload, headers=headers, timeout=60.0
-                ) as response:
-                    data = await response.json()
-                    if "embeddings" in data:
-                        return data["embeddings"][0]
-                    else:
-                        return data["data"][0]["embedding"]
+            async with session.post(
+                self.endpoint, json=payload, headers=headers, timeout=60.0
+            ) as response:
+                data = await response.json()
+                return data["embeddings"][0]
 
     def get_vector_size(self) -> int:
         """