cognee 0.5.0__py3-none-any.whl → 0.5.0.dev0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- cognee/api/client.py +5 -1
- cognee/api/v1/add/add.py +1 -2
- cognee/api/v1/cognify/code_graph_pipeline.py +119 -0
- cognee/api/v1/cognify/cognify.py +16 -24
- cognee/api/v1/cognify/routers/__init__.py +1 -0
- cognee/api/v1/cognify/routers/get_code_pipeline_router.py +90 -0
- cognee/api/v1/cognify/routers/get_cognify_router.py +1 -3
- cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
- cognee/api/v1/ontologies/ontologies.py +37 -12
- cognee/api/v1/ontologies/routers/get_ontology_router.py +25 -27
- cognee/api/v1/search/search.py +0 -4
- cognee/api/v1/ui/ui.py +68 -38
- cognee/context_global_variables.py +16 -61
- cognee/eval_framework/answer_generation/answer_generation_executor.py +0 -10
- cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
- cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +2 -0
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
- cognee/eval_framework/eval_config.py +2 -2
- cognee/eval_framework/modal_run_eval.py +28 -16
- cognee/infrastructure/databases/graph/config.py +0 -3
- cognee/infrastructure/databases/graph/get_graph_engine.py +0 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +0 -15
- cognee/infrastructure/databases/graph/kuzu/adapter.py +0 -228
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +1 -80
- cognee/infrastructure/databases/utils/__init__.py +0 -3
- cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +48 -62
- cognee/infrastructure/databases/vector/config.py +0 -2
- cognee/infrastructure/databases/vector/create_vector_engine.py +0 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +6 -8
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +7 -9
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +10 -11
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +544 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -2
- cognee/infrastructure/databases/vector/vector_db_interface.py +0 -35
- cognee/infrastructure/files/storage/s3_config.py +0 -2
- cognee/infrastructure/llm/LLMGateway.py +2 -5
- cognee/infrastructure/llm/config.py +0 -35
- cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +8 -23
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +16 -17
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +37 -40
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +36 -39
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +1 -19
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +9 -11
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +21 -23
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +34 -42
- cognee/modules/cognify/config.py +0 -2
- cognee/modules/data/deletion/prune_system.py +2 -52
- cognee/modules/data/methods/delete_dataset.py +0 -26
- cognee/modules/engine/models/__init__.py +0 -1
- cognee/modules/graph/cognee_graph/CogneeGraph.py +37 -85
- cognee/modules/graph/cognee_graph/CogneeGraphElements.py +3 -8
- cognee/modules/memify/memify.py +7 -1
- cognee/modules/pipelines/operations/pipeline.py +2 -18
- cognee/modules/retrieval/__init__.py +1 -1
- cognee/modules/retrieval/code_retriever.py +232 -0
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -4
- cognee/modules/retrieval/graph_completion_cot_retriever.py +0 -4
- cognee/modules/retrieval/graph_completion_retriever.py +0 -10
- cognee/modules/retrieval/graph_summary_completion_retriever.py +0 -4
- cognee/modules/retrieval/temporal_retriever.py +0 -4
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +10 -42
- cognee/modules/run_custom_pipeline/run_custom_pipeline.py +1 -8
- cognee/modules/search/methods/get_search_type_tools.py +8 -54
- cognee/modules/search/methods/no_access_control_search.py +0 -4
- cognee/modules/search/methods/search.py +0 -21
- cognee/modules/search/types/SearchType.py +1 -1
- cognee/modules/settings/get_settings.py +0 -19
- cognee/modules/users/methods/get_authenticated_user.py +2 -2
- cognee/modules/users/models/DatasetDatabase.py +3 -15
- cognee/shared/logging_utils.py +0 -4
- cognee/tasks/code/enrich_dependency_graph_checker.py +35 -0
- cognee/tasks/code/get_local_dependencies_checker.py +20 -0
- cognee/tasks/code/get_repo_dependency_graph_checker.py +35 -0
- cognee/tasks/documents/__init__.py +1 -0
- cognee/tasks/documents/check_permissions_on_dataset.py +26 -0
- cognee/tasks/graph/extract_graph_from_data.py +10 -9
- cognee/tasks/repo_processor/__init__.py +2 -0
- cognee/tasks/repo_processor/get_local_dependencies.py +335 -0
- cognee/tasks/repo_processor/get_non_code_files.py +158 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +243 -0
- cognee/tasks/storage/add_data_points.py +2 -142
- cognee/tests/test_cognee_server_start.py +4 -2
- cognee/tests/test_conversation_history.py +1 -23
- cognee/tests/test_delete_bmw_example.py +60 -0
- cognee/tests/test_search_db.py +1 -37
- cognee/tests/unit/api/test_ontology_endpoint.py +89 -77
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +7 -3
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -0
- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
- cognee/tests/unit/modules/graph/cognee_graph_test.py +0 -406
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/METADATA +89 -76
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/RECORD +97 -118
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/WHEEL +1 -1
- cognee/api/v1/ui/node_setup.py +0 -360
- cognee/api/v1/ui/npm_utils.py +0 -50
- cognee/eval_framework/Dockerfile +0 -29
- cognee/infrastructure/databases/dataset_database_handler/__init__.py +0 -3
- cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +0 -80
- cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +0 -18
- cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +0 -10
- cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +0 -81
- cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +0 -168
- cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +0 -10
- cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +0 -10
- cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +0 -30
- cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +0 -50
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +0 -153
- cognee/memify_pipelines/create_triplet_embeddings.py +0 -53
- cognee/modules/engine/models/Triplet.py +0 -9
- cognee/modules/retrieval/register_retriever.py +0 -10
- cognee/modules/retrieval/registered_community_retrievers.py +0 -1
- cognee/modules/retrieval/triplet_retriever.py +0 -182
- cognee/shared/rate_limiting.py +0 -30
- cognee/tasks/memify/get_triplet_datapoints.py +0 -289
- cognee/tests/integration/retrieval/test_triplet_retriever.py +0 -84
- cognee/tests/integration/tasks/test_add_data_points.py +0 -139
- cognee/tests/integration/tasks/test_get_triplet_datapoints.py +0 -69
- cognee/tests/test_dataset_database_handler.py +0 -137
- cognee/tests/test_dataset_delete.py +0 -76
- cognee/tests/test_edge_centered_payload.py +0 -170
- cognee/tests/test_pipeline_cache.py +0 -164
- cognee/tests/unit/infrastructure/llm/test_llm_config.py +0 -46
- cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +0 -214
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +0 -608
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +0 -83
- cognee/tests/unit/tasks/storage/test_add_data_points.py +0 -288
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/entry_points.txt +0 -0
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/NOTICE.md +0 -0

cognee/infrastructure/databases/graph/kuzu/adapter.py

@@ -12,7 +12,6 @@ from contextlib import asynccontextmanager
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Any, List, Union, Optional, Tuple, Type
 
-from cognee.exceptions import CogneeValidationError
 from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.utils.run_sync import run_sync
 from cognee.infrastructure.files.storage import get_file_storage
@@ -1187,11 +1186,6 @@ class KuzuAdapter(GraphDBInterface):
             A tuple with two elements: a list of tuples of (node_id, properties) and a list of
             tuples of (source_id, target_id, relationship_name, properties).
         """
-
-        import time
-
-        start_time = time.time()
-
         try:
             nodes_query = """
                 MATCH (n:Node)
@@ -1255,11 +1249,6 @@ class KuzuAdapter(GraphDBInterface):
                     },
                 )
             )
-
-            retrieval_time = time.time() - start_time
-            logger.info(
-                f"Retrieved {len(nodes)} nodes and {len(edges)} edges in {retrieval_time:.2f} seconds"
-            )
             return formatted_nodes, formatted_edges
         except Exception as e:
             logger.error(f"Failed to get graph data: {e}")
@@ -1428,92 +1417,6 @@ class KuzuAdapter(GraphDBInterface):
             formatted_edges.append((source_id, target_id, rel_type, props))
         return formatted_nodes, formatted_edges
 
-    async def get_id_filtered_graph_data(self, target_ids: list[str]):
-        """
-        Retrieve graph data filtered by specific node IDs, including their direct neighbors
-        and only edges where one endpoint matches those IDs.
-
-        Returns:
-            nodes: List[dict] -> Each dict includes "id" and all node properties
-            edges: List[dict] -> Each dict includes "source", "target", "type", "properties"
-        """
-        import time
-
-        start_time = time.time()
-
-        try:
-            if not target_ids:
-                logger.warning("No target IDs provided for ID-filtered graph retrieval.")
-                return [], []
-
-            if not all(isinstance(x, str) for x in target_ids):
-                raise CogneeValidationError("target_ids must be a list of strings")
-
-            query = """
-                MATCH (n:Node)-[r]->(m:Node)
-                WHERE n.id IN $target_ids OR m.id IN $target_ids
-                RETURN n.id, {
-                    name: n.name,
-                    type: n.type,
-                    properties: n.properties
-                }, m.id, {
-                    name: m.name,
-                    type: m.type,
-                    properties: m.properties
-                }, r.relationship_name, r.properties
-            """
-
-            result = await self.query(query, {"target_ids": target_ids})
-
-            if not result:
-                logger.info("No data returned for the supplied IDs")
-                return [], []
-
-            nodes_dict = {}
-            edges = []
-
-            for n_id, n_props, m_id, m_props, r_type, r_props_raw in result:
-                if n_props.get("properties"):
-                    try:
-                        additional_props = json.loads(n_props["properties"])
-                        n_props.update(additional_props)
-                        del n_props["properties"]
-                    except json.JSONDecodeError:
-                        logger.warning(f"Failed to parse properties JSON for node {n_id}")
-
-                if m_props.get("properties"):
-                    try:
-                        additional_props = json.loads(m_props["properties"])
-                        m_props.update(additional_props)
-                        del m_props["properties"]
-                    except json.JSONDecodeError:
-                        logger.warning(f"Failed to parse properties JSON for node {m_id}")
-
-                nodes_dict[n_id] = (n_id, n_props)
-                nodes_dict[m_id] = (m_id, m_props)
-
-                edge_props = {}
-                if r_props_raw:
-                    try:
-                        edge_props = json.loads(r_props_raw)
-                    except (json.JSONDecodeError, TypeError):
-                        logger.warning(f"Failed to parse edge properties for {n_id}->{m_id}")
-
-                source_id = edge_props.get("source_node_id", n_id)
-                target_id = edge_props.get("target_node_id", m_id)
-                edges.append((source_id, target_id, r_type, edge_props))
-
-            retrieval_time = time.time() - start_time
-            logger.info(
-                f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s"
-            )
-
-            return list(nodes_dict.values()), edges
-
-        except Exception as e:
-            logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}")
-            raise
-
     async def get_graph_metrics(self, include_optional=False) -> Dict[str, Any]:
         """
         Get metrics on graph structure and connectivity.
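
For orientation, the removed `get_id_filtered_graph_data` returned a `(nodes, edges)` pair in which each node is an `(id, properties)` tuple. A minimal caller-side sketch of the old API (the caller code and the `get_graph_engine` import path are assumptions, not taken from this diff):

```python
# Hypothetical usage sketch of the API removed above; not part of the package.
from cognee.infrastructure.databases.graph import get_graph_engine  # assumed import path


async def fetch_neighborhood(target_ids: list[str]):
    graph_engine = await get_graph_engine()
    # In 0.5.0 this returned every edge touching one of the target IDs,
    # plus the nodes on both ends of those edges.
    nodes, edges = await graph_engine.get_id_filtered_graph_data(target_ids)
    for node_id, properties in nodes:
        print(node_id, properties.get("name"))
    return nodes, edges
```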
@@ -2005,134 +1908,3 @@ class KuzuAdapter(GraphDBInterface):
         time_ids_list = [item[0] for item in time_nodes]
 
         return ", ".join(f"'{uid}'" for uid in time_ids_list)
-
-    async def get_triplets_batch(self, offset: int, limit: int) -> list[dict[str, Any]]:
-        """
-        Retrieve a batch of triplets (start_node, relationship, end_node) from the graph.
-
-        Parameters:
-        -----------
-        - offset (int): Number of triplets to skip before returning results.
-        - limit (int): Maximum number of triplets to return.
-
-        Returns:
-        --------
-        - list[dict[str, Any]]: A list of triplets, where each triplet is a dictionary
-          with keys: 'start_node', 'relationship_properties', 'end_node'.
-
-        Raises:
-        -------
-        - ValueError: If offset or limit are negative.
-        - Exception: Re-raises any exceptions from query execution.
-        """
-        if offset < 0:
-            raise ValueError(f"Offset must be non-negative, got {offset}")
-        if limit < 0:
-            raise ValueError(f"Limit must be non-negative, got {limit}")
-
-        query = """
-            MATCH (start_node:Node)-[relationship:EDGE]->(end_node:Node)
-            RETURN {
-                start_node: {
-                    id: start_node.id,
-                    name: start_node.name,
-                    type: start_node.type,
-                    properties: start_node.properties
-                },
-                relationship_properties: {
-                    relationship_name: relationship.relationship_name,
-                    properties: relationship.properties
-                },
-                end_node: {
-                    id: end_node.id,
-                    name: end_node.name,
-                    type: end_node.type,
-                    properties: end_node.properties
-                }
-            } AS triplet
-            SKIP $offset LIMIT $limit
-        """
-
-        try:
-            results = await self.query(query, {"offset": offset, "limit": limit})
-        except Exception as e:
-            logger.error(f"Failed to execute triplet query: {str(e)}")
-            logger.error(f"Query: {query}")
-            logger.error(f"Parameters: offset={offset}, limit={limit}")
-            raise
-
-        triplets = []
-        for idx, row in enumerate(results):
-            try:
-                if not row or len(row) == 0:
-                    logger.warning(f"Skipping empty row at index {idx} in triplet batch")
-                    continue
-
-                if not isinstance(row[0], dict):
-                    logger.warning(
-                        f"Skipping invalid row at index {idx}: expected dict, got {type(row[0])}"
-                    )
-                    continue
-
-                triplet = row[0]
-
-                if "start_node" not in triplet:
-                    logger.warning(f"Skipping triplet at index {idx}: missing 'start_node' key")
-                    continue
-
-                if not isinstance(triplet["start_node"], dict):
-                    logger.warning(f"Skipping triplet at index {idx}: 'start_node' is not a dict")
-                    continue
-
-                triplet["start_node"] = self._parse_node_properties(triplet["start_node"].copy())
-
-                if "relationship_properties" not in triplet:
-                    logger.warning(
-                        f"Skipping triplet at index {idx}: missing 'relationship_properties' key"
-                    )
-                    continue
-
-                if not isinstance(triplet["relationship_properties"], dict):
-                    logger.warning(
-                        f"Skipping triplet at index {idx}: 'relationship_properties' is not a dict"
-                    )
-                    continue
-
-                rel_props = triplet["relationship_properties"].copy()
-                relationship_name = rel_props.get("relationship_name") or ""
-
-                if rel_props.get("properties"):
-                    try:
-                        parsed_props = json.loads(rel_props["properties"])
-                        if isinstance(parsed_props, dict):
-                            rel_props.update(parsed_props)
-                            del rel_props["properties"]
-                        else:
-                            logger.warning(
-                                f"Parsed relationship properties is not a dict for triplet at index {idx}"
-                            )
-                    except (json.JSONDecodeError, TypeError) as e:
-                        logger.warning(
-                            f"Failed to parse relationship properties JSON for triplet at index {idx}: {e}"
-                        )
-
-                rel_props["relationship_name"] = relationship_name
-                triplet["relationship_properties"] = rel_props
-
-                if "end_node" not in triplet:
-                    logger.warning(f"Skipping triplet at index {idx}: missing 'end_node' key")
-                    continue
-
-                if not isinstance(triplet["end_node"], dict):
-                    logger.warning(f"Skipping triplet at index {idx}: 'end_node' is not a dict")
-                    continue
-
-                triplet["end_node"] = self._parse_node_properties(triplet["end_node"].copy())
-
-                triplets.append(triplet)
-
-            except Exception as e:
-                logger.error(f"Error processing triplet at index {idx}: {e}", exc_info=True)
-                continue
-
-        return triplets
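
The removed `get_triplets_batch` was a plain offset/limit pager over `(start_node, relationship, end_node)` rows. A hedged sketch of how calling code could have paged through the whole graph with it (hypothetical helper, not from the package):

```python
# Hypothetical pager over the removed get_triplets_batch API.
# `adapter` is assumed to be a 0.5.0 KuzuAdapter or Neo4jAdapter instance.
async def iter_all_triplets(adapter, batch_size: int = 500):
    offset = 0
    while True:
        batch = await adapter.get_triplets_batch(offset=offset, limit=batch_size)
        if not batch:
            break
        for triplet in batch:
            # Each item carries 'start_node', 'relationship_properties' and 'end_node'.
            yield triplet
        offset += batch_size
```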

cognee/infrastructure/databases/graph/neo4j_driver/adapter.py

@@ -8,7 +8,7 @@ from neo4j import AsyncSession
 from neo4j import AsyncGraphDatabase
 from neo4j.exceptions import Neo4jError
 from contextlib import asynccontextmanager
-from typing import Optional, Any, List, Dict, Type, Tuple
+from typing import Optional, Any, List, Dict, Type, Tuple
 
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int
@@ -964,63 +964,6 @@ class Neo4jAdapter(GraphDBInterface):
             logger.error(f"Error during graph data retrieval: {str(e)}")
             raise
 
-    async def get_id_filtered_graph_data(self, target_ids: list[str]):
-        """
-        Retrieve graph data filtered by specific node IDs, including their direct neighbors
-        and only edges where one endpoint matches those IDs.
-
-        This version uses a single Cypher query for efficiency.
-        """
-        import time
-
-        start_time = time.time()
-
-        try:
-            if not target_ids:
-                logger.warning("No target IDs provided for ID-filtered graph retrieval.")
-                return [], []
-
-            query = """
-                MATCH ()-[r]-()
-                WHERE startNode(r).id IN $target_ids
-                   OR endNode(r).id IN $target_ids
-                WITH DISTINCT r, startNode(r) AS a, endNode(r) AS b
-                RETURN
-                    properties(a) AS n_properties,
-                    properties(b) AS m_properties,
-                    type(r) AS type,
-                    properties(r) AS properties
-            """
-
-            result = await self.query(query, {"target_ids": target_ids})
-
-            nodes_dict = {}
-            edges = []
-
-            for record in result:
-                n_props = record["n_properties"]
-                m_props = record["m_properties"]
-                r_props = record["properties"]
-                r_type = record["type"]
-
-                nodes_dict[n_props["id"]] = (n_props["id"], n_props)
-                nodes_dict[m_props["id"]] = (m_props["id"], m_props)
-
-                source_id = r_props.get("source_node_id", n_props["id"])
-                target_id = r_props.get("target_node_id", m_props["id"])
-                edges.append((source_id, target_id, r_type, r_props))
-
-            retrieval_time = time.time() - start_time
-            logger.info(
-                f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s"
-            )
-
-            return list(nodes_dict.values()), edges
-
-        except Exception as e:
-            logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}")
-            raise
-
     async def get_nodeset_subgraph(
         self, node_type: Type[Any], node_name: List[str]
     ) -> Tuple[List[Tuple[int, dict]], List[Tuple[int, int, str, dict]]]:
@@ -1527,25 +1470,3 @@ class Neo4jAdapter(GraphDBInterface):
         time_ids_list = [item["id"] for item in time_nodes if "id" in item]
 
         return ", ".join(f"'{uid}'" for uid in time_ids_list)
-
-    async def get_triplets_batch(self, offset: int, limit: int) -> list[dict[str, Any]]:
-        """
-        Retrieve a batch of triplets (start_node, relationship, end_node) from the graph.
-
-        Parameters:
-        -----------
-        - offset (int): Number of triplets to skip before returning results.
-        - limit (int): Maximum number of triplets to return.
-
-        Returns:
-        --------
-        - list[dict[str, Any]]: A list of triplets.
-        """
-        query = f"""
-            MATCH (start_node:`{BASE_LABEL}`)-[relationship]->(end_node:`{BASE_LABEL}`)
-            RETURN start_node, properties(relationship) AS relationship_properties, end_node
-            SKIP $offset LIMIT $limit
-        """
-        results = await self.query(query, {"offset": offset, "limit": limit})
-
-        return results

cognee/infrastructure/databases/utils/__init__.py

@@ -1,4 +1 @@
 from .get_or_create_dataset_database import get_or_create_dataset_database
-from .resolve_dataset_database_connection_info import resolve_dataset_database_connection_info
-from .get_graph_dataset_database_handler import get_graph_dataset_database_handler
-from .get_vector_dataset_database_handler import get_vector_dataset_database_handler

cognee/infrastructure/databases/utils/get_or_create_dataset_database.py

@@ -1,9 +1,11 @@
+import os
 from uuid import UUID
-from typing import Union
+from typing import Union
 
 from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError
 
+from cognee.base_config import get_base_config
 from cognee.modules.data.methods import create_dataset
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.infrastructure.databases.vector import get_vectordb_config
@@ -13,53 +15,6 @@ from cognee.modules.users.models import DatasetDatabase
 from cognee.modules.users.models import User
 
 
-async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict:
-    vector_config = get_vectordb_config()
-
-    from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
-        supported_dataset_database_handlers,
-    )
-
-    handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler]
-    return await handler["handler_instance"].create_dataset(dataset_id, user)
-
-
-async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict:
-    graph_config = get_graph_config()
-
-    from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
-        supported_dataset_database_handlers,
-    )
-
-    handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler]
-    return await handler["handler_instance"].create_dataset(dataset_id, user)
-
-
-async def _existing_dataset_database(
-    dataset_id: UUID,
-    user: User,
-) -> Optional[DatasetDatabase]:
-    """
-    Check if a DatasetDatabase row already exists for the given owner + dataset.
-    Return None if it doesn't exist, return the row if it does.
-    Args:
-        dataset_id:
-        user:
-
-    Returns:
-        DatasetDatabase or None
-    """
-    db_engine = get_relational_engine()
-
-    async with db_engine.get_async_session() as session:
-        stmt = select(DatasetDatabase).where(
-            DatasetDatabase.owner_id == user.id,
-            DatasetDatabase.dataset_id == dataset_id,
-        )
-        existing: DatasetDatabase = await session.scalar(stmt)
-        return existing
-
-
 async def get_or_create_dataset_database(
     dataset: Union[str, UUID],
     user: User,
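
The removed `_get_vector_db_info` / `_get_graph_db_info` helpers delegated dataset-database creation to the handler registry that is also dropped in 0.5.0.dev0. A rough sketch of the lookup pattern they used, with the registry shape inferred only from these call sites:

```python
# Sketch of the removed handler-registry lookup. The registry contents lived in
# supported_dataset_database_handlers.py (also removed); its exact shape is assumed here.
from uuid import UUID


async def create_dataset_databases(dataset_id: UUID, user, vector_config, graph_config, registry: dict):
    # The registry is assumed to map a handler name (e.g. "lancedb" or "kuzu") to a dict
    # exposing a ready handler object under the "handler_instance" key.
    vector_handler = registry[vector_config.vector_dataset_database_handler]["handler_instance"]
    graph_handler = registry[graph_config.graph_dataset_database_handler]["handler_instance"]

    vector_info = await vector_handler.create_dataset(dataset_id, user)
    graph_info = await graph_handler.create_dataset(dataset_id, user)
    return vector_info, graph_info
```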
@@ -70,8 +25,6 @@ async def get_or_create_dataset_database(
     • If the row already exists, it is fetched and returned.
     • Otherwise a new one is created atomically and returned.
 
-    DatasetDatabase row contains connection and provider info for vector and graph databases.
-
     Parameters
     ----------
     user : User
@@ -83,26 +36,59 @@ async def get_or_create_dataset_database(
 
     dataset_id = await get_unique_dataset_id(dataset, user)
 
-
-
-    async with db_engine.get_async_session() as session:
-        await create_dataset(dataset, user, session)
+    vector_config = get_vectordb_config()
+    graph_config = get_graph_config()
 
-    #
-
-
-
+    # Note: for hybrid databases both graph and vector DB name have to be the same
+    if graph_config.graph_database_provider == "kuzu":
+        graph_db_name = f"{dataset_id}.pkl"
+    else:
+        graph_db_name = f"{dataset_id}"
 
-
-
+    if vector_config.vector_db_provider == "lancedb":
+        vector_db_name = f"{dataset_id}.lance.db"
+    else:
+        vector_db_name = f"{dataset_id}"
+
+    base_config = get_base_config()
+    databases_directory_path = os.path.join(
+        base_config.system_root_directory, "databases", str(user.id)
+    )
+
+    # Determine vector database URL
+    if vector_config.vector_db_provider == "lancedb":
+        vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name)
+    else:
+        vector_db_url = vector_config.vector_database_url
+
+    # Determine graph database URL
 
     async with db_engine.get_async_session() as session:
+        # Create dataset if it doesn't exist
+        if isinstance(dataset, str):
+            dataset = await create_dataset(dataset, user, session)
+
+        # Try to fetch an existing row first
+        stmt = select(DatasetDatabase).where(
+            DatasetDatabase.owner_id == user.id,
+            DatasetDatabase.dataset_id == dataset_id,
+        )
+        existing: DatasetDatabase = await session.scalar(stmt)
+        if existing:
+            return existing
+
         # If there are no existing rows build a new row
         record = DatasetDatabase(
             owner_id=user.id,
             dataset_id=dataset_id,
-
-
+            vector_database_name=vector_db_name,
+            graph_database_name=graph_db_name,
+            vector_database_provider=vector_config.vector_db_provider,
+            graph_database_provider=graph_config.graph_database_provider,
+            vector_database_url=vector_db_url,
+            graph_database_url=graph_config.graph_database_url,
+            vector_database_key=vector_config.vector_db_key,
+            graph_database_key=graph_config.graph_database_key,
         )
 
         try:
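
After the rewrite, `get_or_create_dataset_database` derives per-dataset database names and locations inline instead of asking a handler. A small sketch of the naming rules the added lines apply (standalone illustration; the function and argument names are mine, not the package's):

```python
# Illustration of the per-dataset naming rules added above; not the package's own helper.
import os
from uuid import UUID


def dataset_database_names(dataset_id: UUID, user_id: UUID, system_root: str,
                           graph_provider: str, vector_provider: str):
    # Kuzu keeps each dataset graph in a single ".pkl" file; other graph providers
    # simply reuse the dataset UUID as the database name.
    graph_db_name = f"{dataset_id}.pkl" if graph_provider == "kuzu" else f"{dataset_id}"
    # LanceDB gets a ".lance.db" store per dataset; other vector providers reuse the UUID.
    vector_db_name = f"{dataset_id}.lance.db" if vector_provider == "lancedb" else f"{dataset_id}"
    # File-backed databases live under <system_root>/databases/<user_id>.
    databases_dir = os.path.join(system_root, "databases", str(user_id))
    return graph_db_name, vector_db_name, databases_dir
```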

cognee/infrastructure/databases/vector/config.py

@@ -28,7 +28,6 @@ class VectorConfig(BaseSettings):
     vector_db_name: str = ""
     vector_db_key: str = ""
     vector_db_provider: str = "lancedb"
-    vector_dataset_database_handler: str = "lancedb"
 
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
@@ -64,7 +63,6 @@ class VectorConfig(BaseSettings):
             "vector_db_name": self.vector_db_name,
             "vector_db_key": self.vector_db_key,
             "vector_db_provider": self.vector_db_provider,
-            "vector_dataset_database_handler": self.vector_dataset_database_handler,
         }
 
 

cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py

@@ -17,7 +17,6 @@ from cognee.infrastructure.databases.exceptions import EmbeddingException
 from cognee.infrastructure.llm.tokenizer.TikToken import (
     TikTokenTokenizer,
 )
-from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
 
 litellm.set_verbose = False
 logger = get_logger("FastembedEmbeddingEngine")
@@ -69,7 +68,7 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
 
     @retry(
         stop=stop_after_delay(128),
-        wait=wait_exponential_jitter(
+        wait=wait_exponential_jitter(2, 128),
         retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
         before_sleep=before_sleep_log(logger, logging.DEBUG),
         reraise=True,
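
The embedding engines now pass explicit backoff bounds to tenacity. Assuming current tenacity semantics, `wait_exponential_jitter(2, 128)` sets `initial=2` and `max=128`: waits of roughly 2, 4, 8, ... seconds capped at 128 s, each with up to 1 s of random jitter, until `stop_after_delay(128)` gives up. A minimal standalone sketch:

```python
# Minimal sketch of the backoff configuration used above; the decorated function is hypothetical.
from tenacity import retry, stop_after_delay, wait_exponential_jitter


@retry(stop=stop_after_delay(128), wait=wait_exponential_jitter(2, 128), reraise=True)
async def call_embedding_endpoint(client, payload):
    # Stands in for the real embedding request; retried with exponential backoff plus jitter.
    return await client.embed(payload)
```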
@@ -97,12 +96,11 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
         if self.mock:
            return [[0.0] * self.dimensions for _ in text]
         else:
-
-
-
-
-
-            )
+            embeddings = self.embedding_model.embed(
+                text,
+                batch_size=len(text),
+                parallel=None,
+            )
 
         return list(embeddings)
 

cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py

@@ -25,7 +25,6 @@ from cognee.infrastructure.llm.tokenizer.Mistral import (
 from cognee.infrastructure.llm.tokenizer.TikToken import (
     TikTokenTokenizer,
 )
-from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
 
 litellm.set_verbose = False
 logger = get_logger("LiteLLMEmbeddingEngine")
|
|
|
110
109
|
response = {"data": [{"embedding": [0.0] * self.dimensions} for _ in text]}
|
|
111
110
|
return [data["embedding"] for data in response["data"]]
|
|
112
111
|
else:
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
)
|
|
112
|
+
response = await litellm.aembedding(
|
|
113
|
+
model=self.model,
|
|
114
|
+
input=text,
|
|
115
|
+
api_key=self.api_key,
|
|
116
|
+
api_base=self.endpoint,
|
|
117
|
+
api_version=self.api_version,
|
|
118
|
+
)
|
|
121
119
|
|
|
122
120
|
return [data["embedding"] for data in response.data]
|
|
123
121
|
|
|
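
The dev build calls `litellm.aembedding` directly instead of going through the removed rate-limiting context manager. A standalone sketch of that call, with placeholder model and key values rather than cognee defaults:

```python
# Standalone sketch of the direct litellm embedding call shown in the hunk above.
import litellm


async def embed_texts(texts: list[str]) -> list[list[float]]:
    response = await litellm.aembedding(
        model="text-embedding-3-small",  # placeholder model name
        input=texts,
        api_key="sk-...",                # placeholder; the engine passes self.api_key
        api_base=None,                   # the engine passes self.endpoint here
        api_version=None,
    )
    # Same extraction as the engine: one embedding vector per input text.
    return [item["embedding"] for item in response.data]
```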

cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py

@@ -18,7 +18,10 @@ from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import Em
 from cognee.infrastructure.llm.tokenizer.HuggingFace import (
     HuggingFaceTokenizer,
 )
-from cognee.
+from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
+    embedding_rate_limit_async,
+    embedding_sleep_and_retry_async,
+)
 from cognee.shared.utils import create_secure_ssl_context
 
 logger = get_logger("OllamaEmbeddingEngine")
@@ -98,7 +101,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
 
     @retry(
         stop=stop_after_delay(128),
-        wait=wait_exponential_jitter(
+        wait=wait_exponential_jitter(2, 128),
         retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
         before_sleep=before_sleep_log(logger, logging.DEBUG),
         reraise=True,
@@ -117,15 +120,11 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         ssl_context = create_secure_ssl_context()
         connector = aiohttp.TCPConnector(ssl=ssl_context)
         async with aiohttp.ClientSession(connector=connector) as session:
-            async with
-
-
-
-
-            if "embeddings" in data:
-                return data["embeddings"][0]
-            else:
-                return data["data"][0]["embedding"]
+            async with session.post(
+                self.endpoint, json=payload, headers=headers, timeout=60.0
+            ) as response:
+                data = await response.json()
+                return data["embeddings"][0]
 
     def get_vector_size(self) -> int:
         """
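
The simplified Ollama path posts once to the configured endpoint and reads only the `embeddings` field of the response. A standalone sketch of that request, with a placeholder endpoint and model (the real engine builds `payload` and `headers` from its config):

```python
# Standalone sketch of the simplified Ollama embedding request; endpoint/model are placeholders.
import aiohttp


async def ollama_embed(text: str,
                       endpoint: str = "http://localhost:11434/api/embed",
                       model: str = "nomic-embed-text") -> list[float]:
    payload = {"model": model, "input": text}
    async with aiohttp.ClientSession() as session:
        async with session.post(
            endpoint, json=payload, timeout=aiohttp.ClientTimeout(total=60)
        ) as response:
            data = await response.json()
            # 0.5.0.dev0 only handles the Ollama-style {"embeddings": [[...]]} response shape.
            return data["embeddings"][0]
```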