cognee 0.5.0.dev0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/client.py +1 -5
- cognee/api/v1/add/add.py +2 -1
- cognee/api/v1/cognify/cognify.py +24 -16
- cognee/api/v1/cognify/routers/__init__.py +0 -1
- cognee/api/v1/cognify/routers/get_cognify_router.py +3 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
- cognee/api/v1/ontologies/ontologies.py +12 -37
- cognee/api/v1/ontologies/routers/get_ontology_router.py +27 -25
- cognee/api/v1/search/search.py +8 -0
- cognee/api/v1/ui/node_setup.py +360 -0
- cognee/api/v1/ui/npm_utils.py +50 -0
- cognee/api/v1/ui/ui.py +38 -68
- cognee/context_global_variables.py +61 -16
- cognee/eval_framework/Dockerfile +29 -0
- cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
- cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
- cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
- cognee/eval_framework/eval_config.py +2 -2
- cognee/eval_framework/modal_run_eval.py +16 -28
- cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
- cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
- cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
- cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/graph/config.py +3 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -0
- cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
- cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
- cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
- cognee/infrastructure/databases/utils/__init__.py +3 -0
- cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +62 -48
- cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
- cognee/infrastructure/databases/vector/config.py +2 -0
- cognee/infrastructure/databases/vector/create_vector_engine.py +1 -0
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -10
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
- cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
- cognee/infrastructure/files/storage/s3_config.py +2 -0
- cognee/infrastructure/llm/LLMGateway.py +5 -2
- cognee/infrastructure/llm/config.py +35 -0
- cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +17 -16
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +40 -37
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +39 -36
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +19 -1
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +11 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +23 -21
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +42 -34
- cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
- cognee/modules/cognify/config.py +2 -0
- cognee/modules/data/deletion/prune_system.py +52 -2
- cognee/modules/data/methods/delete_dataset.py +26 -0
- cognee/modules/engine/models/Triplet.py +9 -0
- cognee/modules/engine/models/__init__.py +1 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +85 -37
- cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
- cognee/modules/memify/memify.py +1 -7
- cognee/modules/pipelines/operations/pipeline.py +18 -2
- cognee/modules/retrieval/__init__.py +1 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +4 -0
- cognee/modules/retrieval/graph_completion_cot_retriever.py +4 -0
- cognee/modules/retrieval/graph_completion_retriever.py +10 -0
- cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
- cognee/modules/retrieval/register_retriever.py +10 -0
- cognee/modules/retrieval/registered_community_retrievers.py +1 -0
- cognee/modules/retrieval/temporal_retriever.py +4 -0
- cognee/modules/retrieval/triplet_retriever.py +182 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +42 -10
- cognee/modules/run_custom_pipeline/run_custom_pipeline.py +8 -1
- cognee/modules/search/methods/get_search_type_tools.py +54 -8
- cognee/modules/search/methods/no_access_control_search.py +4 -0
- cognee/modules/search/methods/search.py +46 -18
- cognee/modules/search/types/SearchType.py +1 -1
- cognee/modules/settings/get_settings.py +19 -0
- cognee/modules/users/methods/get_authenticated_user.py +2 -2
- cognee/modules/users/models/DatasetDatabase.py +15 -3
- cognee/shared/logging_utils.py +4 -0
- cognee/shared/rate_limiting.py +30 -0
- cognee/tasks/documents/__init__.py +0 -1
- cognee/tasks/graph/extract_graph_from_data.py +9 -10
- cognee/tasks/memify/get_triplet_datapoints.py +289 -0
- cognee/tasks/storage/add_data_points.py +142 -2
- cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
- cognee/tests/integration/tasks/test_add_data_points.py +139 -0
- cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
- cognee/tests/test_cognee_server_start.py +2 -4
- cognee/tests/test_conversation_history.py +23 -1
- cognee/tests/test_dataset_database_handler.py +137 -0
- cognee/tests/test_dataset_delete.py +76 -0
- cognee/tests/test_edge_centered_payload.py +170 -0
- cognee/tests/test_pipeline_cache.py +164 -0
- cognee/tests/test_search_db.py +37 -1
- cognee/tests/unit/api/test_ontology_endpoint.py +77 -89
- cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
- cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
- cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
- cognee/tests/unit/modules/search/test_search.py +100 -0
- cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
- {cognee-0.5.0.dev0.dist-info → cognee-0.5.1.dist-info}/METADATA +76 -89
- {cognee-0.5.0.dev0.dist-info → cognee-0.5.1.dist-info}/RECORD +119 -97
- {cognee-0.5.0.dev0.dist-info → cognee-0.5.1.dist-info}/WHEEL +1 -1
- cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
- cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
- cognee/modules/retrieval/code_retriever.py +0 -232
- cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
- cognee/tasks/code/get_local_dependencies_checker.py +0 -20
- cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
- cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
- cognee/tasks/repo_processor/__init__.py +0 -2
- cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
- cognee/tasks/repo_processor/get_non_code_files.py +0 -158
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
- cognee/tests/test_delete_bmw_example.py +0 -60
- {cognee-0.5.0.dev0.dist-info → cognee-0.5.1.dist-info}/entry_points.txt +0 -0
- {cognee-0.5.0.dev0.dist-info → cognee-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.5.0.dev0.dist-info → cognee-0.5.1.dist-info}/licenses/NOTICE.md +0 -0
cognee/modules/data/deletion/prune_system.py
CHANGED
@@ -1,17 +1,67 @@
+from sqlalchemy.exc import OperationalError
+
+from cognee.infrastructure.databases.exceptions import EntityNotFoundError
+from cognee.context_global_variables import backend_access_control_enabled
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
 from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.infrastructure.databases.utils import (
+    get_graph_dataset_database_handler,
+    get_vector_dataset_database_handler,
+)
 from cognee.shared.cache import delete_cache
+from cognee.modules.users.models import DatasetDatabase
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
+
+
+async def prune_graph_databases():
+    db_engine = get_relational_engine()
+    try:
+        dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
+        # Go through each dataset database and delete the graph database
+        for dataset_database in dataset_databases:
+            handler = get_graph_dataset_database_handler(dataset_database)
+            await handler["handler_instance"].delete_dataset(dataset_database)
+    except (OperationalError, EntityNotFoundError) as e:
+        logger.debug(
+            "Skipping pruning of graph DB. Error when accessing dataset_database table: %s",
+            e,
+        )
+        return
+
+
+async def prune_vector_databases():
+    db_engine = get_relational_engine()
+    try:
+        dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
+        # Go through each dataset database and delete the vector database
+        for dataset_database in dataset_databases:
+            handler = get_vector_dataset_database_handler(dataset_database)
+            await handler["handler_instance"].delete_dataset(dataset_database)
+    except (OperationalError, EntityNotFoundError) as e:
+        logger.debug(
+            "Skipping pruning of vector DB. Error when accessing dataset_database table: %s",
+            e,
+        )
+        return
 
 
 async def prune_system(graph=True, vector=True, metadata=True, cache=True):
-    if graph:
+    # Note: prune system should not be available through the API, it has no permission checks and will
+    # delete all graph and vector databases if called. It should only be used in development or testing environments.
+    if graph and not backend_access_control_enabled():
         graph_engine = await get_graph_engine()
         await graph_engine.delete_graph()
+    elif graph and backend_access_control_enabled():
+        await prune_graph_databases()
 
-    if vector:
+    if vector and not backend_access_control_enabled():
         vector_engine = get_vector_engine()
         await vector_engine.prune()
+    elif vector and backend_access_control_enabled():
+        await prune_vector_databases()
 
     if metadata:
         db_engine = get_relational_engine()
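Note: prune_system() now branches on backend_access_control_enabled(). Without access control it keeps the old behavior (wipe the shared graph and vector engines); with it enabled it iterates the dataset_database table and asks each registered handler to delete its per-dataset database. A minimal usage sketch follows (the module path is taken from this diff; as the new comment warns, the call is destructive and intended for development or testing only):

    import asyncio

    from cognee.modules.data.deletion.prune_system import prune_system

    async def reset_dev_environment():
        # Drops graph and vector stores (shared or per-dataset, depending on
        # access control), then relational metadata and the local cache.
        await prune_system(graph=True, vector=True, metadata=True, cache=True)

    asyncio.run(reset_dev_environment())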
cognee/modules/data/methods/delete_dataset.py
CHANGED
@@ -1,8 +1,34 @@
+from cognee.modules.users.models import DatasetDatabase
+from sqlalchemy import select
+
 from cognee.modules.data.models import Dataset
+from cognee.infrastructure.databases.utils.get_vector_dataset_database_handler import (
+    get_vector_dataset_database_handler,
+)
+from cognee.infrastructure.databases.utils.get_graph_dataset_database_handler import (
+    get_graph_dataset_database_handler,
+)
 from cognee.infrastructure.databases.relational import get_relational_engine
 
 
 async def delete_dataset(dataset: Dataset):
     db_engine = get_relational_engine()
 
+    async with db_engine.get_async_session() as session:
+        stmt = select(DatasetDatabase).where(
+            DatasetDatabase.dataset_id == dataset.id,
+        )
+        dataset_database: DatasetDatabase = await session.scalar(stmt)
+        if dataset_database:
+            graph_dataset_database_handler = get_graph_dataset_database_handler(dataset_database)
+            vector_dataset_database_handler = get_vector_dataset_database_handler(dataset_database)
+            await graph_dataset_database_handler["handler_instance"].delete_dataset(
+                dataset_database
+            )
+            await vector_dataset_database_handler["handler_instance"].delete_dataset(
+                dataset_database
+            )
+    # TODO: Remove dataset from pipeline_run_status in Data objects related to dataset as well
+    # This blocks recreation of the dataset with the same name and data after deletion as
+    # it's marked as completed and will be just skipped even though it's empty.
    return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id)
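Note: both lookups above return a registry entry rather than a bare handler object, so the instantiated handler is reached through the "handler_instance" key before delete_dataset() is awaited. A condensed sketch of that call shape, reusing the names from this hunk:

    async def drop_dataset_stores(dataset_database):
        # Graph and vector stores are torn down through the same interface.
        for get_handler in (
            get_graph_dataset_database_handler,
            get_vector_dataset_database_handler,
        ):
            handler_entry = get_handler(dataset_database)
            await handler_entry["handler_instance"].delete_dataset(dataset_database)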
cognee/modules/graph/cognee_graph/CogneeGraph.py
CHANGED
@@ -56,6 +56,68 @@ class CogneeGraph(CogneeAbstractGraph):
     def get_edges(self) -> List[Edge]:
         return self.edges
 
+    async def _get_nodeset_subgraph(
+        self,
+        adapter,
+        node_type,
+        node_name,
+    ):
+        """Retrieve subgraph based on node type and name."""
+        logger.info("Retrieving graph filtered by node type and node name (NodeSet).")
+        nodes_data, edges_data = await adapter.get_nodeset_subgraph(
+            node_type=node_type, node_name=node_name
+        )
+        if not nodes_data or not edges_data:
+            raise EntityNotFoundError(
+                message="Nodeset does not exist, or empty nodeset projected from the database."
+            )
+        return nodes_data, edges_data
+
+    async def _get_full_or_id_filtered_graph(
+        self,
+        adapter,
+        relevant_ids_to_filter,
+    ):
+        """Retrieve full or ID-filtered graph with fallback."""
+        if relevant_ids_to_filter is None:
+            logger.info("Retrieving full graph.")
+            nodes_data, edges_data = await adapter.get_graph_data()
+            if not nodes_data or not edges_data:
+                raise EntityNotFoundError(message="Empty graph projected from the database.")
+            return nodes_data, edges_data
+
+        get_graph_data_fn = getattr(adapter, "get_id_filtered_graph_data", adapter.get_graph_data)
+        if getattr(adapter.__class__, "get_id_filtered_graph_data", None):
+            logger.info("Retrieving ID-filtered graph from database.")
+            nodes_data, edges_data = await get_graph_data_fn(target_ids=relevant_ids_to_filter)
+        else:
+            logger.info("Retrieving full graph from database.")
+            nodes_data, edges_data = await get_graph_data_fn()
+        if hasattr(adapter, "get_id_filtered_graph_data") and (not nodes_data or not edges_data):
+            logger.warning(
+                "Id filtered graph returned empty, falling back to full graph retrieval."
+            )
+            logger.info("Retrieving full graph")
+            nodes_data, edges_data = await adapter.get_graph_data()
+
+        if not nodes_data or not edges_data:
+            raise EntityNotFoundError("Empty graph projected from the database.")
+        return nodes_data, edges_data
+
+    async def _get_filtered_graph(
+        self,
+        adapter,
+        memory_fragment_filter,
+    ):
+        """Retrieve graph filtered by attributes."""
+        logger.info("Retrieving graph filtered by memory fragment")
+        nodes_data, edges_data = await adapter.get_filtered_graph_data(
+            attribute_filters=memory_fragment_filter
+        )
+        if not nodes_data or not edges_data:
+            raise EntityNotFoundError(message="Empty filtered graph projected from the database.")
+        return nodes_data, edges_data
+
     async def project_graph_from_db(
         self,
         adapter: Union[GraphDBInterface],
@@ -67,40 +129,39 @@ class CogneeGraph(CogneeAbstractGraph):
         memory_fragment_filter=[],
         node_type: Optional[Type] = None,
         node_name: Optional[List[str]] = None,
+        relevant_ids_to_filter: Optional[List[str]] = None,
+        triplet_distance_penalty: float = 3.5,
     ) -> None:
         if node_dimension < 1 or edge_dimension < 1:
             raise InvalidDimensionsError()
         try:
-            import time
-
-            start_time = time.time()
-
-            # Determine projection strategy
             if node_type is not None and node_name not in [None, [], ""]:
-                nodes_data, edges_data = await adapter.get_nodeset_subgraph(
-                    node_type=node_type, node_name=node_name
+                nodes_data, edges_data = await self._get_nodeset_subgraph(
+                    adapter, node_type, node_name
                 )
-                if not nodes_data or not edges_data:
-                    raise EntityNotFoundError(
-                        message="Nodeset does not exist, or empty nodetes projected from the database."
-                    )
             elif len(memory_fragment_filter) == 0:
-                nodes_data, edges_data = await adapter.get_graph_data()
-                if not nodes_data or not edges_data:
-                    raise EntityNotFoundError(message="Empty graph projected from the database.")
+                nodes_data, edges_data = await self._get_full_or_id_filtered_graph(
+                    adapter, relevant_ids_to_filter
+                )
             else:
-                nodes_data, edges_data = await adapter.get_filtered_graph_data(
-                    attribute_filters=memory_fragment_filter
+                nodes_data, edges_data = await self._get_filtered_graph(
+                    adapter, memory_fragment_filter
                 )
-                if not nodes_data or not edges_data:
-                    raise EntityNotFoundError(
-                        message="Empty filtered graph projected from the database."
-                    )
 
+            import time
+
+            start_time = time.time()
             # Process nodes
             for node_id, properties in nodes_data:
                 node_attributes = {key: properties.get(key) for key in node_properties_to_project}
-                self.add_node(Node(str(node_id), node_attributes, dimension=node_dimension))
+                self.add_node(
+                    Node(
+                        str(node_id),
+                        node_attributes,
+                        dimension=node_dimension,
+                        node_penalty=triplet_distance_penalty,
+                    )
+                )
 
             # Process edges
             for source_id, target_id, relationship_type, properties in edges_data:
@@ -118,6 +179,7 @@ class CogneeGraph(CogneeAbstractGraph):
                     attributes=edge_attributes,
                    directed=directed,
                    dimension=edge_dimension,
+                    edge_penalty=triplet_distance_penalty,
                )
                self.add_edge(edge)
 
@@ -149,24 +211,10 @@ class CogneeGraph(CogneeAbstractGraph):
                     node.add_attribute("vector_distance", score)
                     mapped_nodes += 1
 
-    async def map_vector_distances_to_graph_edges(
-        self, vector_engine, query_vector, edge_distances
-    ) -> None:
+    async def map_vector_distances_to_graph_edges(self, edge_distances) -> None:
         try:
-            if query_vector is None or len(query_vector) == 0:
-                raise ValueError("Failed to generate query embedding.")
-
             if edge_distances is None:
-
-                edge_distances = await vector_engine.search(
-                    collection_name="EdgeType_relationship_name",
-                    query_vector=query_vector,
-                    limit=None,
-                )
-                projection_time = time.time() - start_time
-                logger.info(
-                    f"Edge collection distances were calculated separately from nodes in {projection_time:.2f}s"
-                )
+                return
 
             embedding_map = {result.payload["text"]: result.score for result in edge_distances}
 
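Note: the projection logic was factored into three helpers, and _get_full_or_id_filtered_graph duck-types the adapter: if its class defines get_id_filtered_graph_data, the ID-filtered projection is used and an empty result falls back to the full graph; otherwise get_graph_data() is called directly. A standalone sketch of that dispatch pattern (the toy adapters are illustrative, not classes from this diff):

    class FullOnlyAdapter:
        async def get_graph_data(self):
            return [("n1", {})], [("n1", "n2", "likes", {})]

    class IdFilteringAdapter(FullOnlyAdapter):
        async def get_id_filtered_graph_data(self, target_ids):
            # Return only the subgraph touching target_ids.
            return [(i, {}) for i in target_ids], []

    async def project(adapter, relevant_ids_to_filter):
        get_graph_data_fn = getattr(
            adapter, "get_id_filtered_graph_data", adapter.get_graph_data
        )
        if getattr(adapter.__class__, "get_id_filtered_graph_data", None):
            nodes, edges = await get_graph_data_fn(target_ids=relevant_ids_to_filter)
            if not nodes or not edges:
                # Empty filtered result: fall back to the full projection.
                nodes, edges = await adapter.get_graph_data()
        else:
            nodes, edges = await get_graph_data_fn()
        return nodes, edges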
cognee/modules/graph/cognee_graph/CogneeGraphElements.py
CHANGED
@@ -20,13 +20,17 @@ class Node:
     status: np.ndarray
 
     def __init__(
-        self, node_id: str, attributes: Optional[Dict[str, Any]] = None, dimension: int = 1
+        self,
+        node_id: str,
+        attributes: Optional[Dict[str, Any]] = None,
+        dimension: int = 1,
+        node_penalty: float = 3.5,
     ):
         if dimension <= 0:
             raise InvalidDimensionsError()
         self.id = node_id
         self.attributes = attributes if attributes is not None else {}
-        self.attributes["vector_distance"] = np.inf
+        self.attributes["vector_distance"] = node_penalty
         self.skeleton_neighbours = []
         self.skeleton_edges = []
         self.status = np.ones(dimension, dtype=int)
@@ -105,13 +109,14 @@ class Edge:
         attributes: Optional[Dict[str, Any]] = None,
         directed: bool = True,
         dimension: int = 1,
+        edge_penalty: float = 3.5,
     ):
         if dimension <= 0:
             raise InvalidDimensionsError()
         self.node1 = node1
         self.node2 = node2
         self.attributes = attributes if attributes is not None else {}
-        self.attributes["vector_distance"] = np.inf
+        self.attributes["vector_distance"] = edge_penalty
         self.directed = directed
         self.status = np.ones(dimension, dtype=int)
 
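Note: Node and Edge previously fixed their initial vector_distance (the value was garbled in this extraction and is reconstructed above as np.inf, the sentinel historically used for unscored elements); both now take a configurable penalty (default 3.5) that acts as the distance prior for elements never matched by a vector search, and project_graph_from_db threads its triplet_distance_penalty into both constructors. A small sketch using the constructor exactly as changed above:

    node = Node("chunk-1", {"name": "chunk"}, dimension=1, node_penalty=3.5)
    assert node.attributes["vector_distance"] == 3.5  # prior until a real score is mapped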
cognee/modules/memify/memify.py
CHANGED
@@ -12,9 +12,6 @@ from cognee.modules.users.models import User
 from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
     resolve_authorized_user_datasets,
 )
-from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
-    reset_dataset_pipeline_run_status,
-)
 from cognee.modules.engine.operations.setup import setup
 from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor
 from cognee.tasks.memify.extract_subgraph_chunks import extract_subgraph_chunks
@@ -97,10 +94,6 @@ async def memify(
         *enrichment_tasks,
     ]
 
-    await reset_dataset_pipeline_run_status(
-        authorized_dataset.id, user, pipeline_names=["memify_pipeline"]
-    )
-
     # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for
     pipeline_executor_func = get_pipeline_executor(run_in_background=run_in_background)
 
@@ -113,6 +106,7 @@ async def memify(
         datasets=authorized_dataset.id,
         vector_db_config=vector_db_config,
         graph_db_config=graph_db_config,
+        use_pipeline_cache=False,
         incremental_loading=False,
         pipeline_name="memify_pipeline",
     )
cognee/modules/pipelines/operations/pipeline.py
CHANGED
@@ -20,6 +20,9 @@ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
 from cognee.modules.pipelines.layers.check_pipeline_run_qualification import (
     check_pipeline_run_qualification,
 )
+from cognee.modules.pipelines.models.PipelineRunInfo import (
+    PipelineRunStarted,
+)
 from typing import Any
 
 logger = get_logger("cognee.pipeline")
@@ -35,6 +38,7 @@ async def run_pipeline(
     pipeline_name: str = "custom_pipeline",
     vector_db_config: dict = None,
     graph_db_config: dict = None,
+    use_pipeline_cache: bool = False,
     incremental_loading: bool = False,
     data_per_batch: int = 20,
 ):
@@ -51,6 +55,7 @@ async def run_pipeline(
         data=data,
         pipeline_name=pipeline_name,
         context={"dataset": dataset},
+        use_pipeline_cache=use_pipeline_cache,
         incremental_loading=incremental_loading,
         data_per_batch=data_per_batch,
     ):
@@ -64,6 +69,7 @@ async def run_pipeline_per_dataset(
     data=None,
     pipeline_name: str = "custom_pipeline",
     context: dict = None,
+    use_pipeline_cache=False,
     incremental_loading=False,
     data_per_batch: int = 20,
 ):
@@ -77,8 +83,18 @@ async def run_pipeline_per_dataset(
     if process_pipeline_status:
         # If pipeline was already processed or is currently being processed
         # return status information to async generator and finish execution
-        yield process_pipeline_status
-        return
+        if use_pipeline_cache:
+            # If pipeline caching is enabled we do not proceed with re-processing
+            yield process_pipeline_status
+            return
+        else:
+            # If pipeline caching is disabled we always return pipeline started information and proceed with re-processing
+            yield PipelineRunStarted(
+                pipeline_run_id=process_pipeline_status.pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+                payload=data,
+            )
 
     pipeline_run = run_tasks(
         tasks,
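Note: the new use_pipeline_cache flag (default False) decides what happens when a previous run is recorded (the process_pipeline_status branch): with caching on, the stored status is yielded and execution stops; with caching off, a PipelineRunStarted is yielded and the tasks execute again (memify above passes use_pipeline_cache=False explicitly). A self-contained toy generator illustrating just that control flow (simplified types, not the cognee classes):

    import asyncio

    async def run_once(cached_status, use_pipeline_cache):
        # Mirrors run_pipeline_per_dataset's early-exit branch.
        if cached_status is not None:
            if use_pipeline_cache:
                yield ("cached", cached_status)
                return
            yield ("started", cached_status)
        yield ("processed", None)

    async def main():
        async for event in run_once("completed", use_pipeline_cache=True):
            print(event)  # ('cached', 'completed') only
        async for event in run_once("completed", use_pipeline_cache=False):
            print(event)  # ('started', ...) then ('processed', ...)

    asyncio.run(main())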
cognee/modules/retrieval/__init__.py
CHANGED
@@ -1 +1 @@
-
+
cognee/modules/retrieval/graph_completion_context_extension_retriever.py
CHANGED
@@ -39,6 +39,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever):
         node_type: Optional[Type] = None,
         node_name: Optional[List[str]] = None,
         save_interaction: bool = False,
+        wide_search_top_k: Optional[int] = 100,
+        triplet_distance_penalty: Optional[float] = 3.5,
     ):
         super().__init__(
             user_prompt_path=user_prompt_path,
@@ -48,6 +50,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever):
             node_name=node_name,
             save_interaction=save_interaction,
             system_prompt=system_prompt,
+            wide_search_top_k=wide_search_top_k,
+            triplet_distance_penalty=triplet_distance_penalty,
         )
 
     async def get_completion(
cognee/modules/retrieval/graph_completion_cot_retriever.py
CHANGED
@@ -65,6 +65,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever):
         node_type: Optional[Type] = None,
         node_name: Optional[List[str]] = None,
         save_interaction: bool = False,
+        wide_search_top_k: Optional[int] = 100,
+        triplet_distance_penalty: Optional[float] = 3.5,
     ):
         super().__init__(
             user_prompt_path=user_prompt_path,
@@ -74,6 +76,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever):
             node_type=node_type,
             node_name=node_name,
             save_interaction=save_interaction,
+            wide_search_top_k=wide_search_top_k,
+            triplet_distance_penalty=triplet_distance_penalty,
         )
         self.validation_system_prompt_path = validation_system_prompt_path
         self.validation_user_prompt_path = validation_user_prompt_path
cognee/modules/retrieval/graph_completion_retriever.py
CHANGED
@@ -47,6 +47,8 @@ class GraphCompletionRetriever(BaseGraphRetriever):
         node_type: Optional[Type] = None,
         node_name: Optional[List[str]] = None,
         save_interaction: bool = False,
+        wide_search_top_k: Optional[int] = 100,
+        triplet_distance_penalty: Optional[float] = 3.5,
     ):
         """Initialize retriever with prompt paths and search parameters."""
         self.save_interaction = save_interaction
@@ -54,8 +56,10 @@ class GraphCompletionRetriever(BaseGraphRetriever):
         self.system_prompt_path = system_prompt_path
         self.system_prompt = system_prompt
         self.top_k = top_k if top_k is not None else 5
+        self.wide_search_top_k = wide_search_top_k
         self.node_type = node_type
         self.node_name = node_name
+        self.triplet_distance_penalty = triplet_distance_penalty
 
     async def resolve_edges_to_text(self, retrieved_edges: list) -> str:
         """
@@ -105,6 +109,8 @@ class GraphCompletionRetriever(BaseGraphRetriever):
             collections=vector_index_collections or None,
             node_type=self.node_type,
             node_name=self.node_name,
+            wide_search_top_k=self.wide_search_top_k,
+            triplet_distance_penalty=self.triplet_distance_penalty,
         )
 
         return found_triplets
@@ -141,6 +147,10 @@ class GraphCompletionRetriever(BaseGraphRetriever):
 
         return triplets
 
+    async def convert_retrieved_objects_to_context(self, triplets: List[Edge]):
+        context = await self.resolve_edges_to_text(triplets)
+        return context
+
     async def get_completion(
         self,
         query: str,
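Note: every retriever in the GraphCompletionRetriever family now exposes two tuning knobs: wide_search_top_k (default 100), forwarded with the triplet search parameters and, by its name and placement alongside top_k, the breadth of the initial candidate search before the final top_k cut, and triplet_distance_penalty (default 3.5), the vector_distance prior given to unscored graph elements during projection. A construction sketch (only keyword arguments visible in this diff are shown; the remaining parameters are assumed to keep their defaults):

    retriever = GraphCompletionRetriever(
        top_k=5,                       # triplets kept for the completion context
        wide_search_top_k=100,         # breadth of the wide candidate search
        triplet_distance_penalty=3.5,  # distance prior for unscored nodes/edges
    )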
cognee/modules/retrieval/graph_summary_completion_retriever.py
CHANGED
@@ -26,6 +26,8 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever):
         node_type: Optional[Type] = None,
         node_name: Optional[List[str]] = None,
         save_interaction: bool = False,
+        wide_search_top_k: Optional[int] = 100,
+        triplet_distance_penalty: Optional[float] = 3.5,
     ):
         """Initialize retriever with default prompt paths and search parameters."""
         super().__init__(
@@ -36,6 +38,8 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever):
             node_name=node_name,
             save_interaction=save_interaction,
             system_prompt=system_prompt,
+            wide_search_top_k=wide_search_top_k,
+            triplet_distance_penalty=triplet_distance_penalty,
         )
         self.summarize_prompt_path = summarize_prompt_path
 
cognee/modules/retrieval/register_retriever.py
ADDED
@@ -0,0 +1,10 @@
+from typing import Type
+
+from .base_retriever import BaseRetriever
+from .registered_community_retrievers import registered_community_retrievers
+from ..search.types import SearchType
+
+
+def use_retriever(search_type: SearchType, retriever: Type[BaseRetriever]):
+    """Register a retriever class for a given search type."""
+    registered_community_retrievers[search_type] = retriever
cognee/modules/retrieval/registered_community_retrievers.py
ADDED
@@ -0,0 +1 @@
+registered_community_retrievers = {}
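Note: together these two new modules form a small plugin surface: registered_community_retrievers is a module-level registry mapping SearchType values to retriever classes, and use_retriever() installs an entry. A hedged usage sketch (the custom class body is illustrative, and get_context is assumed to be the BaseRetriever hook a subclass overrides):

    from cognee.modules.retrieval.base_retriever import BaseRetriever
    from cognee.modules.retrieval.register_retriever import use_retriever
    from cognee.modules.search.types import SearchType

    class CommunityRetriever(BaseRetriever):
        async def get_context(self, query: str):
            # Hypothetical: resolve context however the plugin likes.
            return f"context for {query}"

    # Route a search type to the custom retriever class.
    use_retriever(SearchType.GRAPH_COMPLETION, CommunityRetriever)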
cognee/modules/retrieval/temporal_retriever.py
CHANGED
@@ -47,6 +47,8 @@ class TemporalRetriever(GraphCompletionRetriever):
         top_k: Optional[int] = 5,
         node_type: Optional[Type] = None,
         node_name: Optional[List[str]] = None,
+        wide_search_top_k: Optional[int] = 100,
+        triplet_distance_penalty: Optional[float] = 3.5,
     ):
         super().__init__(
             user_prompt_path=user_prompt_path,
@@ -54,6 +56,8 @@ class TemporalRetriever(GraphCompletionRetriever):
             top_k=top_k,
             node_type=node_type,
             node_name=node_name,
+            wide_search_top_k=wide_search_top_k,
+            triplet_distance_penalty=triplet_distance_penalty,
         )
         self.user_prompt_path = user_prompt_path
         self.system_prompt_path = system_prompt_path