cognee 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +5 -1
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/ui/ui.py +47 -16
- cognee/api/v1/update/routers/get_update_router.py +1 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +5 -0
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +34 -2
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/METADATA +22 -7
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -128
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- distributed/poetry.lock +0 -12238
- distributed/pyproject.toml +0 -185
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -4,7 +4,7 @@ import os
|
|
|
4
4
|
import json
|
|
5
5
|
import asyncio
|
|
6
6
|
import tempfile
|
|
7
|
-
from uuid import UUID
|
|
7
|
+
from uuid import UUID, uuid5, NAMESPACE_OID
|
|
8
8
|
from kuzu import Connection
|
|
9
9
|
from kuzu.database import Database
|
|
10
10
|
from datetime import datetime, timezone
|
|
@@ -23,9 +23,14 @@ from cognee.infrastructure.engine import DataPoint
|
|
|
23
23
|
from cognee.modules.storage.utils import JSONEncoder
|
|
24
24
|
from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int
|
|
25
25
|
from cognee.tasks.temporal_graph.models import Timestamp
|
|
26
|
+
from cognee.infrastructure.databases.cache.config import get_cache_config
|
|
26
27
|
|
|
27
28
|
logger = get_logger()
|
|
28
29
|
|
|
30
|
+
cache_config = get_cache_config()
|
|
31
|
+
if cache_config.shared_kuzu_lock:
|
|
32
|
+
from cognee.infrastructure.databases.cache.get_cache_engine import get_cache_engine
|
|
33
|
+
|
|
29
34
|
|
|
30
35
|
class KuzuAdapter(GraphDBInterface):
|
|
31
36
|
"""
|
|
@@ -39,12 +44,20 @@ class KuzuAdapter(GraphDBInterface):
|
|
|
39
44
|
|
|
40
45
|
def __init__(self, db_path: str):
|
|
41
46
|
"""Initialize Kuzu database connection and schema."""
|
|
47
|
+
self.open_connections = 0
|
|
48
|
+
self._is_closed = False
|
|
42
49
|
self.db_path = db_path # Path for the database directory
|
|
43
50
|
self.db: Optional[Database] = None
|
|
44
51
|
self.connection: Optional[Connection] = None
|
|
45
|
-
|
|
46
|
-
|
|
52
|
+
if cache_config.shared_kuzu_lock:
|
|
53
|
+
self.redis_lock = get_cache_engine(
|
|
54
|
+
lock_key="kuzu-lock-" + str(uuid5(NAMESPACE_OID, db_path))
|
|
55
|
+
)
|
|
56
|
+
else:
|
|
57
|
+
self.executor = ThreadPoolExecutor()
|
|
58
|
+
self._initialize_connection()
|
|
47
59
|
self.KUZU_ASYNC_LOCK = asyncio.Lock()
|
|
60
|
+
self._connection_change_lock = asyncio.Lock()
|
|
48
61
|
|
|
49
62
|
def _initialize_connection(self) -> None:
|
|
50
63
|
"""Initialize the Kuzu database connection and schema."""
|
|
@@ -185,6 +198,15 @@ class KuzuAdapter(GraphDBInterface):
|
|
|
185
198
|
except FileNotFoundError:
|
|
186
199
|
logger.warning(f"Kuzu S3 storage file not found: {self.db_path}")
|
|
187
200
|
|
|
201
|
+
async def is_empty(self) -> bool:
|
|
202
|
+
query = """
|
|
203
|
+
MATCH (n)
|
|
204
|
+
RETURN true
|
|
205
|
+
LIMIT 1;
|
|
206
|
+
"""
|
|
207
|
+
query_result = await self.query(query)
|
|
208
|
+
return len(query_result) == 0
|
|
209
|
+
|
|
188
210
|
async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]:
|
|
189
211
|
"""
|
|
190
212
|
Execute a Kuzu query asynchronously with automatic reconnection.
|
|
@@ -209,9 +231,13 @@ class KuzuAdapter(GraphDBInterface):
|
|
|
209
231
|
params = params or {}
|
|
210
232
|
|
|
211
233
|
def blocking_query():
|
|
234
|
+
lock_acquired = False
|
|
212
235
|
try:
|
|
236
|
+
if cache_config.shared_kuzu_lock:
|
|
237
|
+
self.redis_lock.acquire_lock()
|
|
238
|
+
lock_acquired = True
|
|
213
239
|
if not self.connection:
|
|
214
|
-
logger.
|
|
240
|
+
logger.info("Reconnecting to Kuzu database...")
|
|
215
241
|
self._initialize_connection()
|
|
216
242
|
|
|
217
243
|
result = self.connection.execute(query, params)
|
|
@@ -225,12 +251,47 @@ class KuzuAdapter(GraphDBInterface):
|
|
|
225
251
|
val = val.as_py()
|
|
226
252
|
processed_rows.append(val)
|
|
227
253
|
rows.append(tuple(processed_rows))
|
|
254
|
+
|
|
228
255
|
return rows
|
|
229
256
|
except Exception as e:
|
|
230
257
|
logger.error(f"Query execution failed: {str(e)}")
|
|
231
258
|
raise
|
|
232
|
-
|
|
233
|
-
|
|
259
|
+
finally:
|
|
260
|
+
if cache_config.shared_kuzu_lock and lock_acquired:
|
|
261
|
+
try:
|
|
262
|
+
self.close()
|
|
263
|
+
finally:
|
|
264
|
+
self.redis_lock.release_lock()
|
|
265
|
+
|
|
266
|
+
if cache_config.shared_kuzu_lock:
|
|
267
|
+
async with self._connection_change_lock:
|
|
268
|
+
self.open_connections += 1
|
|
269
|
+
logger.info(f"Open connections after open: {self.open_connections}")
|
|
270
|
+
try:
|
|
271
|
+
result = blocking_query()
|
|
272
|
+
finally:
|
|
273
|
+
self.open_connections -= 1
|
|
274
|
+
logger.info(f"Open connections after close: {self.open_connections}")
|
|
275
|
+
return result
|
|
276
|
+
else:
|
|
277
|
+
result = await loop.run_in_executor(self.executor, blocking_query)
|
|
278
|
+
return result
|
|
279
|
+
|
|
280
|
+
def close(self):
|
|
281
|
+
if self.connection:
|
|
282
|
+
del self.connection
|
|
283
|
+
self.connection = None
|
|
284
|
+
if self.db:
|
|
285
|
+
del self.db
|
|
286
|
+
self.db = None
|
|
287
|
+
self._is_closed = True
|
|
288
|
+
logger.info("Kuzu database closed successfully")
|
|
289
|
+
|
|
290
|
+
def reopen(self):
|
|
291
|
+
if self._is_closed:
|
|
292
|
+
self._is_closed = False
|
|
293
|
+
self._initialize_connection()
|
|
294
|
+
logger.info("Kuzu database re-opened successfully")
|
|
234
295
|
|
|
235
296
|
@asynccontextmanager
|
|
236
297
|
async def get_session(self):
|
|
@@ -1557,44 +1618,6 @@ class KuzuAdapter(GraphDBInterface):
|
|
|
1557
1618
|
logger.error(f"Failed to delete graph data: {e}")
|
|
1558
1619
|
raise
|
|
1559
1620
|
|
|
1560
|
-
async def clear_database(self) -> None:
|
|
1561
|
-
"""
|
|
1562
|
-
Clear all data from the database by deleting the database files and reinitializing.
|
|
1563
|
-
|
|
1564
|
-
This method removes all files associated with the database and reinitializes the Kuzu
|
|
1565
|
-
database structure, ensuring a completely empty state. It handles exceptions that might
|
|
1566
|
-
occur during file deletions or initializations carefully.
|
|
1567
|
-
"""
|
|
1568
|
-
try:
|
|
1569
|
-
if self.connection:
|
|
1570
|
-
self.connection = None
|
|
1571
|
-
if self.db:
|
|
1572
|
-
self.db.close()
|
|
1573
|
-
self.db = None
|
|
1574
|
-
|
|
1575
|
-
db_dir = os.path.dirname(self.db_path)
|
|
1576
|
-
db_name = os.path.basename(self.db_path)
|
|
1577
|
-
file_storage = get_file_storage(db_dir)
|
|
1578
|
-
|
|
1579
|
-
if await file_storage.file_exists(db_name):
|
|
1580
|
-
await file_storage.remove_all()
|
|
1581
|
-
logger.info(f"Deleted Kuzu database files at {self.db_path}")
|
|
1582
|
-
|
|
1583
|
-
# Reinitialize the database
|
|
1584
|
-
self._initialize_connection()
|
|
1585
|
-
# Verify the database is empty
|
|
1586
|
-
result = self.connection.execute("MATCH (n:Node) RETURN COUNT(n)")
|
|
1587
|
-
count = result.get_next()[0] if result.has_next() else 0
|
|
1588
|
-
if count > 0:
|
|
1589
|
-
logger.warning(
|
|
1590
|
-
f"Database still contains {count} nodes after clearing, forcing deletion"
|
|
1591
|
-
)
|
|
1592
|
-
self.connection.execute("MATCH (n:Node) DETACH DELETE n")
|
|
1593
|
-
logger.info("Database cleared successfully")
|
|
1594
|
-
except Exception as e:
|
|
1595
|
-
logger.error(f"Error during database clearing: {e}")
|
|
1596
|
-
raise
|
|
1597
|
-
|
|
1598
1621
|
async def get_document_subgraph(self, data_id: str):
|
|
1599
1622
|
"""
|
|
1600
1623
|
Get all nodes that should be deleted when removing a document.
|
|
@@ -68,6 +68,7 @@ class Neo4jAdapter(GraphDBInterface):
|
|
|
68
68
|
auth=auth,
|
|
69
69
|
max_connection_lifetime=120,
|
|
70
70
|
notifications_min_severity="OFF",
|
|
71
|
+
keep_alive=True,
|
|
71
72
|
)
|
|
72
73
|
|
|
73
74
|
async def initialize(self) -> None:
|
|
@@ -86,6 +87,15 @@ class Neo4jAdapter(GraphDBInterface):
|
|
|
86
87
|
async with self.driver.session(database=self.graph_database_name) as session:
|
|
87
88
|
yield session
|
|
88
89
|
|
|
90
|
+
async def is_empty(self) -> bool:
|
|
91
|
+
query = """
|
|
92
|
+
RETURN EXISTS {
|
|
93
|
+
MATCH (n)
|
|
94
|
+
} AS node_exists;
|
|
95
|
+
"""
|
|
96
|
+
query_result = await self.query(query)
|
|
97
|
+
return not query_result[0]["node_exists"]
|
|
98
|
+
|
|
89
99
|
@deadlock_retry()
|
|
90
100
|
async def query(
|
|
91
101
|
self,
|
|
@@ -205,7 +215,7 @@ class Neo4jAdapter(GraphDBInterface):
|
|
|
205
215
|
{
|
|
206
216
|
"node_id": str(node.id),
|
|
207
217
|
"label": type(node).__name__,
|
|
208
|
-
"properties": self.serialize_properties(node
|
|
218
|
+
"properties": self.serialize_properties(dict(node)),
|
|
209
219
|
}
|
|
210
220
|
for node in nodes
|
|
211
221
|
]
|
|
@@ -1066,7 +1076,7 @@ class Neo4jAdapter(GraphDBInterface):
|
|
|
1066
1076
|
query_nodes = f"""
|
|
1067
1077
|
MATCH (n)
|
|
1068
1078
|
WHERE {where_clause}
|
|
1069
|
-
RETURN
|
|
1079
|
+
RETURN n.id AS id, labels(n) AS labels, properties(n) AS properties
|
|
1070
1080
|
"""
|
|
1071
1081
|
result_nodes = await self.query(query_nodes)
|
|
1072
1082
|
|
|
@@ -1081,7 +1091,7 @@ class Neo4jAdapter(GraphDBInterface):
|
|
|
1081
1091
|
query_edges = f"""
|
|
1082
1092
|
MATCH (n)-[r]->(m)
|
|
1083
1093
|
WHERE {where_clause} AND {where_clause.replace("n.", "m.")}
|
|
1084
|
-
RETURN
|
|
1094
|
+
RETURN n.id AS source, n.id AS target, TYPE(r) AS type, properties(r) AS properties
|
|
1085
1095
|
"""
|
|
1086
1096
|
result_edges = await self.query(query_edges)
|
|
1087
1097
|
|
|
@@ -8,7 +8,7 @@ from cognee.infrastructure.utils.calculate_backoff import calculate_backoff
|
|
|
8
8
|
logger = get_logger("deadlock_retry")
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def deadlock_retry(max_retries=
|
|
11
|
+
def deadlock_retry(max_retries=10):
|
|
12
12
|
"""
|
|
13
13
|
Decorator that automatically retries an asynchronous function when rate limit errors occur.
|
|
14
14
|
|
|
@@ -53,7 +53,7 @@ def parse_neptune_url(url: str) -> Tuple[str, str]:
|
|
|
53
53
|
return graph_id, region
|
|
54
54
|
|
|
55
55
|
except Exception as e:
|
|
56
|
-
raise ValueError(f"Failed to parse Neptune Analytics URL '{url}': {str(e)}")
|
|
56
|
+
raise ValueError(f"Failed to parse Neptune Analytics URL '{url}': {str(e)}") from e
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
def validate_graph_id(graph_id: str) -> bool:
|
|
@@ -283,7 +283,7 @@ class SQLAlchemyAdapter:
|
|
|
283
283
|
try:
|
|
284
284
|
data_entity = (await session.scalars(select(Data).where(Data.id == data_id))).one()
|
|
285
285
|
except (ValueError, NoResultFound) as e:
|
|
286
|
-
raise EntityNotFoundError(message=f"Entity not found: {str(e)}")
|
|
286
|
+
raise EntityNotFoundError(message=f"Entity not found: {str(e)}") from e
|
|
287
287
|
|
|
288
288
|
# Check if other data objects point to the same raw data location
|
|
289
289
|
raw_data_location_entities = (
|
|
@@ -1,8 +1,17 @@
|
|
|
1
|
-
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
2
3
|
from typing import List, Optional
|
|
3
4
|
from fastembed import TextEmbedding
|
|
4
5
|
import litellm
|
|
5
|
-
import
|
|
6
|
+
from tenacity import (
|
|
7
|
+
retry,
|
|
8
|
+
stop_after_delay,
|
|
9
|
+
wait_exponential_jitter,
|
|
10
|
+
retry_if_not_exception_type,
|
|
11
|
+
before_sleep_log,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
from cognee.shared.logging_utils import get_logger
|
|
6
15
|
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
|
7
16
|
from cognee.infrastructure.databases.exceptions import EmbeddingException
|
|
8
17
|
from cognee.infrastructure.llm.tokenizer.TikToken import (
|
|
@@ -57,6 +66,13 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
|
|
|
57
66
|
enable_mocking = str(enable_mocking).lower()
|
|
58
67
|
self.mock = enable_mocking in ("true", "1", "yes")
|
|
59
68
|
|
|
69
|
+
@retry(
|
|
70
|
+
stop=stop_after_delay(128),
|
|
71
|
+
wait=wait_exponential_jitter(2, 128),
|
|
72
|
+
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
|
|
73
|
+
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
|
74
|
+
reraise=True,
|
|
75
|
+
)
|
|
60
76
|
async def embed_text(self, text: List[str]) -> List[List[float]]:
|
|
61
77
|
"""
|
|
62
78
|
Embed the given text into numerical vectors.
|
|
@@ -90,7 +106,9 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
|
|
|
90
106
|
|
|
91
107
|
except Exception as error:
|
|
92
108
|
logger.error(f"Embedding error in FastembedEmbeddingEngine: {str(error)}")
|
|
93
|
-
raise EmbeddingException(
|
|
109
|
+
raise EmbeddingException(
|
|
110
|
+
f"Failed to index data points using model {self.model}"
|
|
111
|
+
) from error
|
|
94
112
|
|
|
95
113
|
def get_vector_size(self) -> int:
|
|
96
114
|
"""
|
|
@@ -1,15 +1,21 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
|
|
2
4
|
from cognee.shared.logging_utils import get_logger
|
|
3
5
|
from typing import List, Optional
|
|
4
6
|
import numpy as np
|
|
5
7
|
import math
|
|
8
|
+
from tenacity import (
|
|
9
|
+
retry,
|
|
10
|
+
stop_after_delay,
|
|
11
|
+
wait_exponential_jitter,
|
|
12
|
+
retry_if_not_exception_type,
|
|
13
|
+
before_sleep_log,
|
|
14
|
+
)
|
|
6
15
|
import litellm
|
|
7
16
|
import os
|
|
8
17
|
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
|
9
18
|
from cognee.infrastructure.databases.exceptions import EmbeddingException
|
|
10
|
-
from cognee.infrastructure.llm.tokenizer.Gemini import (
|
|
11
|
-
GeminiTokenizer,
|
|
12
|
-
)
|
|
13
19
|
from cognee.infrastructure.llm.tokenizer.HuggingFace import (
|
|
14
20
|
HuggingFaceTokenizer,
|
|
15
21
|
)
|
|
@@ -19,10 +25,6 @@ from cognee.infrastructure.llm.tokenizer.Mistral import (
|
|
|
19
25
|
from cognee.infrastructure.llm.tokenizer.TikToken import (
|
|
20
26
|
TikTokenTokenizer,
|
|
21
27
|
)
|
|
22
|
-
from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
|
|
23
|
-
embedding_rate_limit_async,
|
|
24
|
-
embedding_sleep_and_retry_async,
|
|
25
|
-
)
|
|
26
28
|
|
|
27
29
|
litellm.set_verbose = False
|
|
28
30
|
logger = get_logger("LiteLLMEmbeddingEngine")
|
|
@@ -76,8 +78,13 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
|
|
|
76
78
|
enable_mocking = str(enable_mocking).lower()
|
|
77
79
|
self.mock = enable_mocking in ("true", "1", "yes")
|
|
78
80
|
|
|
79
|
-
@
|
|
80
|
-
|
|
81
|
+
@retry(
|
|
82
|
+
stop=stop_after_delay(128),
|
|
83
|
+
wait=wait_exponential_jitter(2, 128),
|
|
84
|
+
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
|
|
85
|
+
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
|
86
|
+
reraise=True,
|
|
87
|
+
)
|
|
81
88
|
async def embed_text(self, text: List[str]) -> List[List[float]]:
|
|
82
89
|
"""
|
|
83
90
|
Embed a list of text strings into vector representations.
|
|
@@ -150,7 +157,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
|
|
|
150
157
|
litellm.exceptions.NotFoundError,
|
|
151
158
|
) as e:
|
|
152
159
|
logger.error(f"Embedding error with model {self.model}: {str(e)}")
|
|
153
|
-
raise EmbeddingException(f"Failed to index data points using model {self.model}")
|
|
160
|
+
raise EmbeddingException(f"Failed to index data points using model {self.model}") from e
|
|
154
161
|
|
|
155
162
|
except Exception as error:
|
|
156
163
|
logger.error("Error embedding text: %s", str(error))
|
|
@@ -3,8 +3,16 @@ from cognee.shared.logging_utils import get_logger
|
|
|
3
3
|
import aiohttp
|
|
4
4
|
from typing import List, Optional
|
|
5
5
|
import os
|
|
6
|
-
|
|
6
|
+
import litellm
|
|
7
|
+
import logging
|
|
7
8
|
import aiohttp.http_exceptions
|
|
9
|
+
from tenacity import (
|
|
10
|
+
retry,
|
|
11
|
+
stop_after_delay,
|
|
12
|
+
wait_exponential_jitter,
|
|
13
|
+
retry_if_not_exception_type,
|
|
14
|
+
before_sleep_log,
|
|
15
|
+
)
|
|
8
16
|
|
|
9
17
|
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
|
10
18
|
from cognee.infrastructure.llm.tokenizer.HuggingFace import (
|
|
@@ -69,7 +77,6 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
|
|
|
69
77
|
enable_mocking = str(enable_mocking).lower()
|
|
70
78
|
self.mock = enable_mocking in ("true", "1", "yes")
|
|
71
79
|
|
|
72
|
-
@embedding_rate_limit_async
|
|
73
80
|
async def embed_text(self, text: List[str]) -> List[List[float]]:
|
|
74
81
|
"""
|
|
75
82
|
Generate embedding vectors for a list of text prompts.
|
|
@@ -92,7 +99,13 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
|
|
|
92
99
|
embeddings = await asyncio.gather(*[self._get_embedding(prompt) for prompt in text])
|
|
93
100
|
return embeddings
|
|
94
101
|
|
|
95
|
-
@
|
|
102
|
+
@retry(
|
|
103
|
+
stop=stop_after_delay(128),
|
|
104
|
+
wait=wait_exponential_jitter(2, 128),
|
|
105
|
+
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
|
|
106
|
+
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
|
107
|
+
reraise=True,
|
|
108
|
+
)
|
|
96
109
|
async def _get_embedding(self, prompt: str) -> List[float]:
|
|
97
110
|
"""
|
|
98
111
|
Internal method to call the Ollama embeddings endpoint for a single prompt.
|
|
@@ -111,7 +124,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
|
|
|
111
124
|
self.endpoint, json=payload, headers=headers, timeout=60.0
|
|
112
125
|
) as response:
|
|
113
126
|
data = await response.json()
|
|
114
|
-
return data["
|
|
127
|
+
return data["embeddings"][0]
|
|
115
128
|
|
|
116
129
|
def get_vector_size(self) -> int:
|
|
117
130
|
"""
|
|
@@ -24,11 +24,10 @@ class EmbeddingConfig(BaseSettings):
|
|
|
24
24
|
model_config = SettingsConfigDict(env_file=".env", extra="allow")
|
|
25
25
|
|
|
26
26
|
def model_post_init(self, __context) -> None:
|
|
27
|
-
# If embedding batch size is not defined use 2048 as default for OpenAI and 100 for all other embedding models
|
|
28
27
|
if not self.embedding_batch_size and self.embedding_provider.lower() == "openai":
|
|
29
|
-
self.embedding_batch_size =
|
|
28
|
+
self.embedding_batch_size = 36
|
|
30
29
|
elif not self.embedding_batch_size:
|
|
31
|
-
self.embedding_batch_size =
|
|
30
|
+
self.embedding_batch_size = 36
|
|
32
31
|
|
|
33
32
|
def to_dict(self) -> dict:
|
|
34
33
|
"""
|
|
@@ -15,7 +15,7 @@ class CollectionNotFoundError(CogneeValidationError):
|
|
|
15
15
|
self,
|
|
16
16
|
message,
|
|
17
17
|
name: str = "CollectionNotFoundError",
|
|
18
|
-
status_code: int = status.
|
|
18
|
+
status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
|
|
19
19
|
log=True,
|
|
20
20
|
log_level="DEBUG",
|
|
21
21
|
):
|
|
@@ -324,7 +324,6 @@ class LanceDBAdapter(VectorDBInterface):
|
|
|
324
324
|
|
|
325
325
|
def get_data_point_schema(self, model_type: BaseModel):
|
|
326
326
|
related_models_fields = []
|
|
327
|
-
|
|
328
327
|
for field_name, field_config in model_type.model_fields.items():
|
|
329
328
|
if hasattr(field_config, "model_fields"):
|
|
330
329
|
related_models_fields.append(field_name)
|
|
@@ -8,6 +8,6 @@ class FileContentHashingError(Exception):
|
|
|
8
8
|
self,
|
|
9
9
|
message: str = "Failed to hash content of the file.",
|
|
10
10
|
name: str = "FileContentHashingError",
|
|
11
|
-
status_code=status.
|
|
11
|
+
status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
|
|
12
12
|
):
|
|
13
13
|
super().__init__(message, name, status_code)
|
|
@@ -82,16 +82,16 @@ class LocalFileStorage(Storage):
|
|
|
82
82
|
self.ensure_directory_exists(file_dir_path)
|
|
83
83
|
|
|
84
84
|
if overwrite or not os.path.exists(full_file_path):
|
|
85
|
-
|
|
86
|
-
full_file_path,
|
|
87
|
-
mode="w" if isinstance(data, str) else "wb",
|
|
88
|
-
encoding="utf-8" if isinstance(data, str) else None,
|
|
89
|
-
) as file:
|
|
90
|
-
if hasattr(data, "read"):
|
|
91
|
-
data.seek(0)
|
|
92
|
-
file.write(data.read())
|
|
93
|
-
else:
|
|
85
|
+
if isinstance(data, str):
|
|
86
|
+
with open(full_file_path, mode="w", encoding="utf-8", newline="\n") as file:
|
|
94
87
|
file.write(data)
|
|
88
|
+
else:
|
|
89
|
+
with open(full_file_path, mode="wb") as file:
|
|
90
|
+
if hasattr(data, "read"):
|
|
91
|
+
data.seek(0)
|
|
92
|
+
file.write(data.read())
|
|
93
|
+
else:
|
|
94
|
+
file.write(data)
|
|
95
95
|
|
|
96
96
|
file.close()
|
|
97
97
|
|
|
@@ -70,18 +70,18 @@ class S3FileStorage(Storage):
|
|
|
70
70
|
if overwrite or not await self.file_exists(file_path):
|
|
71
71
|
|
|
72
72
|
def save_data_to_file():
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
) as file:
|
|
78
|
-
if hasattr(data, "read"):
|
|
79
|
-
data.seek(0)
|
|
80
|
-
file.write(data.read())
|
|
81
|
-
else:
|
|
73
|
+
if isinstance(data, str):
|
|
74
|
+
with self.s3.open(
|
|
75
|
+
full_file_path, mode="w", encoding="utf-8", newline="\n"
|
|
76
|
+
) as file:
|
|
82
77
|
file.write(data)
|
|
83
|
-
|
|
84
|
-
|
|
78
|
+
else:
|
|
79
|
+
with self.s3.open(full_file_path, mode="wb") as file:
|
|
80
|
+
if hasattr(data, "read"):
|
|
81
|
+
data.seek(0)
|
|
82
|
+
file.write(data.read())
|
|
83
|
+
else:
|
|
84
|
+
file.write(data)
|
|
85
85
|
|
|
86
86
|
await run_async(save_data_to_file)
|
|
87
87
|
|
|
@@ -124,6 +124,12 @@ def guess_file_type(file: BinaryIO) -> filetype.Type:
|
|
|
124
124
|
"""
|
|
125
125
|
file_type = filetype.guess(file)
|
|
126
126
|
|
|
127
|
+
# If file type could not be determined consider it a plain text file as they don't have magic number encoding
|
|
128
|
+
if file_type is None:
|
|
129
|
+
from filetype.types.base import Type
|
|
130
|
+
|
|
131
|
+
file_type = Type("text/plain", "txt")
|
|
132
|
+
|
|
127
133
|
if file_type is None:
|
|
128
134
|
raise FileTypeException(f"Unknown file detected: {file.name}.")
|
|
129
135
|
|
|
@@ -10,8 +10,6 @@ Here are the available `SearchType` tools and their specific functions:
|
|
|
10
10
|
- Summarizing large amounts of information
|
|
11
11
|
- Quick understanding of complex subjects
|
|
12
12
|
|
|
13
|
-
* **`INSIGHTS`**: The `INSIGHTS` search type discovers connections and relationships between entities in the knowledge graph.
|
|
14
|
-
|
|
15
13
|
**Best for:**
|
|
16
14
|
|
|
17
15
|
- Discovering how entities are connected
|
|
@@ -95,9 +93,6 @@ Here are the available `SearchType` tools and their specific functions:
|
|
|
95
93
|
Query: "Summarize the key findings from these research papers"
|
|
96
94
|
Response: `SUMMARIES`
|
|
97
95
|
|
|
98
|
-
Query: "What is the relationship between the methodologies used in these papers?"
|
|
99
|
-
Response: `INSIGHTS`
|
|
100
|
-
|
|
101
96
|
Query: "When was Einstein born?"
|
|
102
97
|
Response: `CHUNKS`
|
|
103
98
|
|
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py
CHANGED
|
@@ -1,19 +1,24 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from typing import Type
|
|
2
3
|
from pydantic import BaseModel
|
|
4
|
+
import litellm
|
|
3
5
|
import instructor
|
|
6
|
+
from cognee.shared.logging_utils import get_logger
|
|
7
|
+
from tenacity import (
|
|
8
|
+
retry,
|
|
9
|
+
stop_after_delay,
|
|
10
|
+
wait_exponential_jitter,
|
|
11
|
+
retry_if_not_exception_type,
|
|
12
|
+
before_sleep_log,
|
|
13
|
+
)
|
|
4
14
|
|
|
5
|
-
from cognee.infrastructure.llm.exceptions import MissingSystemPromptPathError
|
|
6
15
|
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
|
|
7
16
|
LLMInterface,
|
|
8
17
|
)
|
|
9
|
-
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import (
|
|
10
|
-
rate_limit_async,
|
|
11
|
-
sleep_and_retry_async,
|
|
12
|
-
)
|
|
13
|
-
|
|
14
|
-
from cognee.infrastructure.llm.LLMGateway import LLMGateway
|
|
15
18
|
from cognee.infrastructure.llm.config import get_llm_config
|
|
16
19
|
|
|
20
|
+
logger = get_logger()
|
|
21
|
+
|
|
17
22
|
|
|
18
23
|
class AnthropicAdapter(LLMInterface):
|
|
19
24
|
"""
|
|
@@ -35,8 +40,13 @@ class AnthropicAdapter(LLMInterface):
|
|
|
35
40
|
self.model = model
|
|
36
41
|
self.max_completion_tokens = max_completion_tokens
|
|
37
42
|
|
|
38
|
-
@
|
|
39
|
-
|
|
43
|
+
@retry(
|
|
44
|
+
stop=stop_after_delay(128),
|
|
45
|
+
wait=wait_exponential_jitter(2, 128),
|
|
46
|
+
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
|
|
47
|
+
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
|
48
|
+
reraise=True,
|
|
49
|
+
)
|
|
40
50
|
async def acreate_structured_output(
|
|
41
51
|
self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
|
|
42
52
|
) -> BaseModel:
|
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py
CHANGED
|
@@ -12,11 +12,18 @@ from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError
|
|
|
12
12
|
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
|
|
13
13
|
LLMInterface,
|
|
14
14
|
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
15
|
+
import logging
|
|
16
|
+
from cognee.shared.logging_utils import get_logger
|
|
17
|
+
from tenacity import (
|
|
18
|
+
retry,
|
|
19
|
+
stop_after_delay,
|
|
20
|
+
wait_exponential_jitter,
|
|
21
|
+
retry_if_not_exception_type,
|
|
22
|
+
before_sleep_log,
|
|
18
23
|
)
|
|
19
24
|
|
|
25
|
+
logger = get_logger()
|
|
26
|
+
|
|
20
27
|
|
|
21
28
|
class GeminiAdapter(LLMInterface):
|
|
22
29
|
"""
|
|
@@ -58,8 +65,13 @@ class GeminiAdapter(LLMInterface):
|
|
|
58
65
|
|
|
59
66
|
self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON)
|
|
60
67
|
|
|
61
|
-
@
|
|
62
|
-
|
|
68
|
+
@retry(
|
|
69
|
+
stop=stop_after_delay(128),
|
|
70
|
+
wait=wait_exponential_jitter(2, 128),
|
|
71
|
+
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
|
|
72
|
+
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
|
73
|
+
reraise=True,
|
|
74
|
+
)
|
|
63
75
|
async def acreate_structured_output(
|
|
64
76
|
self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
|
|
65
77
|
) -> BaseModel:
|
|
@@ -12,11 +12,18 @@ from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError
|
|
|
12
12
|
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
|
|
13
13
|
LLMInterface,
|
|
14
14
|
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
15
|
+
import logging
|
|
16
|
+
from cognee.shared.logging_utils import get_logger
|
|
17
|
+
from tenacity import (
|
|
18
|
+
retry,
|
|
19
|
+
stop_after_delay,
|
|
20
|
+
wait_exponential_jitter,
|
|
21
|
+
retry_if_not_exception_type,
|
|
22
|
+
before_sleep_log,
|
|
18
23
|
)
|
|
19
24
|
|
|
25
|
+
logger = get_logger()
|
|
26
|
+
|
|
20
27
|
|
|
21
28
|
class GenericAPIAdapter(LLMInterface):
|
|
22
29
|
"""
|
|
@@ -58,8 +65,13 @@ class GenericAPIAdapter(LLMInterface):
|
|
|
58
65
|
|
|
59
66
|
self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON)
|
|
60
67
|
|
|
61
|
-
@
|
|
62
|
-
|
|
68
|
+
@retry(
|
|
69
|
+
stop=stop_after_delay(128),
|
|
70
|
+
wait=wait_exponential_jitter(2, 128),
|
|
71
|
+
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
|
|
72
|
+
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
|
73
|
+
reraise=True,
|
|
74
|
+
)
|
|
63
75
|
async def acreate_structured_output(
|
|
64
76
|
self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
|
|
65
77
|
) -> BaseModel:
|