cognee 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/v1/cloud/routers/get_checks_router.py +1 -1
- cognee/api/v1/cognify/cognify.py +44 -7
- cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
- cognee/api/v1/prune/prune.py +2 -2
- cognee/api/v1/search/search.py +1 -1
- cognee/api/v1/sync/sync.py +16 -5
- cognee/base_config.py +19 -1
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
- cognee/infrastructure/databases/graph/kuzu/remote_kuzu_adapter.py +4 -1
- cognee/infrastructure/databases/relational/ModelBase.py +2 -1
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +2 -6
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +6 -5
- cognee/infrastructure/files/storage/LocalFileStorage.py +50 -0
- cognee/infrastructure/files/storage/S3FileStorage.py +56 -9
- cognee/infrastructure/files/storage/StorageManager.py +18 -0
- cognee/infrastructure/files/utils/get_file_metadata.py +6 -1
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +4 -2
- cognee/modules/cloud/operations/check_api_key.py +4 -1
- cognee/modules/data/deletion/prune_system.py +5 -1
- cognee/modules/data/methods/create_authorized_dataset.py +9 -0
- cognee/modules/data/methods/get_authorized_dataset.py +1 -1
- cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
- cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
- cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
- cognee/modules/notebooks/methods/create_notebook.py +34 -0
- cognee/modules/notebooks/methods/get_notebooks.py +27 -1
- cognee/modules/notebooks/models/Notebook.py +206 -1
- cognee/modules/observability/get_observe.py +14 -0
- cognee/modules/observability/observers.py +1 -0
- cognee/modules/ontology/base_ontology_resolver.py +42 -0
- cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
- cognee/modules/ontology/matching_strategies.py +53 -0
- cognee/modules/ontology/models.py +20 -0
- cognee/modules/ontology/ontology_config.py +24 -0
- cognee/modules/ontology/ontology_env_config.py +45 -0
- cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
- cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +13 -0
- cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +1 -1
- cognee/modules/pipelines/models/PipelineRunInfo.py +7 -2
- cognee/modules/retrieval/temporal_retriever.py +2 -2
- cognee/modules/search/methods/get_search_type_tools.py +7 -0
- cognee/modules/search/methods/search.py +12 -13
- cognee/modules/search/utils/prepare_search_result.py +28 -6
- cognee/modules/search/utils/transform_context_to_graph.py +1 -1
- cognee/modules/search/utils/transform_insights_to_graph.py +28 -0
- cognee/modules/users/methods/create_user.py +4 -24
- cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
- cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
- cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +19 -2
- cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
- cognee/modules/users/permissions/methods/get_principal.py +9 -0
- cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
- cognee/modules/users/permissions/methods/get_role.py +10 -0
- cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
- cognee/modules/users/permissions/methods/get_tenant.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
- cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
- cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
- cognee/modules/users/roles/methods/create_role.py +10 -0
- cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
- cognee/modules/users/tenants/methods/create_tenant.py +10 -0
- cognee/root_dir.py +5 -0
- cognee/shared/cache.py +346 -0
- cognee/shared/utils.py +12 -0
- cognee/tasks/graph/extract_graph_from_data.py +53 -10
- cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
- cognee/tasks/ingestion/save_data_item_to_storage.py +1 -0
- cognee/tasks/temporal_graph/models.py +11 -6
- cognee/tests/cli_tests/cli_unit_tests/test_cli_main.py +5 -5
- cognee/tests/test_cognee_server_start.py +4 -4
- cognee/tests/test_temporal_graph.py +6 -34
- cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
- cognee/tests/unit/modules/users/test_tutorial_notebook_creation.py +399 -0
- {cognee-0.3.3.dist-info → cognee-0.3.4.dist-info}/METADATA +11 -8
- {cognee-0.3.3.dist-info → cognee-0.3.4.dist-info}/RECORD +81 -73
- cognee/modules/notebooks/methods/create_tutorial_notebook.py +0 -92
- {cognee-0.3.3.dist-info → cognee-0.3.4.dist-info}/WHEEL +0 -0
- {cognee-0.3.3.dist-info → cognee-0.3.4.dist-info}/entry_points.txt +0 -0
- {cognee-0.3.3.dist-info → cognee-0.3.4.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.3.dist-info → cognee-0.3.4.dist-info}/licenses/NOTICE.md +0 -0
cognee/modules/users/roles/methods/create_role.py
CHANGED

@@ -16,6 +16,16 @@ async def create_role(
     role_name: str,
     owner_id: UUID,
 ):
+    """
+    Create a new role with the given name, if the request owner with the given id
+    has the necessary permission.
+    Args:
+        role_name: Name of the new role.
+        owner_id: Id of the request owner.
+
+    Returns:
+        None
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         user = await get_user(owner_id)
cognee/modules/users/tenants/methods/add_user_to_tenant.py
CHANGED

@@ -13,6 +13,18 @@ from cognee.modules.users.exceptions import (


 async def add_user_to_tenant(user_id: UUID, tenant_id: UUID, owner_id: UUID):
+    """
+    Add a user with the given id to the tenant with the given id.
+    This can only be successful if the request owner with the given id is the tenant owner.
+    Args:
+        user_id: Id of the user.
+        tenant_id: Id of the tenant.
+        owner_id: Id of the request owner.
+
+    Returns:
+        None
+
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         user = await get_user(user_id)
cognee/modules/users/tenants/methods/create_tenant.py
CHANGED

@@ -8,6 +8,16 @@ from cognee.modules.users.methods import get_user


 async def create_tenant(tenant_name: str, user_id: UUID):
+    """
+    Create a new tenant with the given name, for the user with the given id.
+    This user is the owner of the tenant.
+    Args:
+        tenant_name: Name of the new tenant.
+        user_id: Id of the user.
+
+    Returns:
+        None
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         try:
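A minimal usage sketch of these newly documented helpers. The tenant name, role name, and owner id are placeholders, and importing the functions directly from their module files is an assumption based on the file paths listed above:

from uuid import UUID

from cognee.modules.users.roles.methods.create_role import create_role
from cognee.modules.users.tenants.methods.create_tenant import create_tenant


async def bootstrap_workspace(owner_id: UUID) -> None:
    # owner_id is a placeholder for an existing user's id.
    await create_tenant("acme", owner_id)   # this user becomes the tenant owner
    await create_role("editors", owner_id)  # succeeds only if the owner has the needed permission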
cognee/root_dir.py
CHANGED

@@ -20,6 +20,11 @@ def ensure_absolute_path(path: str) -> str:
     """
     if path is None:
         raise ValueError("Path cannot be None")
+
+    # Check if it's an S3 URL - S3 URLs are absolute by definition
+    if path.startswith("s3://"):
+        return path
+
     path_obj = Path(path).expanduser()
     if path_obj.is_absolute():
         return str(path_obj.resolve())
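A small sketch of the new behaviour; the bucket name and home-relative path below are illustrative values, not anything shipped with the package:

from cognee.root_dir import ensure_absolute_path

# S3 URLs are returned unchanged, since they are already absolute.
print(ensure_absolute_path("s3://my-bucket/cognee_system"))
# -> s3://my-bucket/cognee_system

# Local paths are still expanded and resolved as before.
print(ensure_absolute_path("~/cognee_system"))
# -> e.g. /home/<user>/cognee_system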
cognee/shared/cache.py
ADDED

@@ -0,0 +1,346 @@
+"""
+Storage-aware cache management utilities for Cognee.
+
+This module provides cache functionality that works with both local and cloud storage
+backends (like S3) through the StorageManager abstraction.
+"""
+
+import hashlib
+import zipfile
+import asyncio
+from typing import Optional, Tuple
+import aiohttp
+import logging
+from io import BytesIO
+
+from cognee.base_config import get_base_config
+from cognee.infrastructure.files.storage.get_file_storage import get_file_storage
+from cognee.infrastructure.files.storage.StorageManager import StorageManager
+from cognee.shared.utils import create_secure_ssl_context
+
+logger = logging.getLogger(__name__)
+
+
+class StorageAwareCache:
+    """
+    A cache manager that works with different storage backends (local, S3, etc.)
+    """
+
+    def __init__(self, cache_subdir: str = "cache"):
+        """
+        Initialize the cache manager.
+
+        Args:
+            cache_subdir: Subdirectory name within the system root for caching
+        """
+        self.base_config = get_base_config()
+        # Since we're using cache_root_directory, don't add extra cache prefix
+        self.cache_base_path = ""
+        self.storage_manager: StorageManager = get_file_storage(
+            self.base_config.cache_root_directory
+        )
+
+        # Print absolute path
+        storage_path = self.storage_manager.storage.storage_path
+        if storage_path.startswith("s3://"):
+            absolute_path = storage_path  # S3 paths are already absolute
+        else:
+            import os
+
+            absolute_path = os.path.abspath(storage_path)
+        logger.info(f"Storage manager absolute path: {absolute_path}")
+
+    async def get_cache_dir(self) -> str:
+        """Get the base cache directory path."""
+        cache_path = self.cache_base_path or "."  # Use "." for root when cache_base_path is empty
+        await self.storage_manager.ensure_directory_exists(cache_path)
+        return cache_path
+
+    async def get_cache_subdir(self, name: str) -> str:
+        """Get a specific cache subdirectory."""
+        if self.cache_base_path:
+            cache_path = f"{self.cache_base_path}/{name}"
+        else:
+            cache_path = name
+        await self.storage_manager.ensure_directory_exists(cache_path)
+
+        # Return the absolute path based on storage system
+        if self.storage_manager.storage.storage_path.startswith("s3://"):
+            return cache_path
+        elif hasattr(self.storage_manager.storage, "storage_path"):
+            return f"{self.storage_manager.storage.storage_path}/{cache_path}"
+        else:
+            # Fallback for other storage types
+            return cache_path
+
+    async def delete_cache(self):
+        """Delete the entire cache directory."""
+        logger.info("Deleting cache...")
+        try:
+            await self.storage_manager.remove_all(self.cache_base_path)
+            logger.info("✓ Cache deleted successfully!")
+        except Exception as e:
+            logger.error(f"Error deleting cache: {e}")
+            raise
+
+    async def _is_cache_valid(self, cache_dir: str, version_or_hash: str) -> bool:
+        """Check if cached content is valid for the given version/hash."""
+        version_file = f"{cache_dir}/version.txt"
+
+        if not await self.storage_manager.file_exists(version_file):
+            return False
+
+        try:
+            async with self.storage_manager.open(version_file, "r") as f:
+                cached_version = (await asyncio.to_thread(f.read)).strip()
+                return cached_version == version_or_hash
+        except Exception as e:
+            logger.debug(f"Error checking cache validity: {e}")
+            return False
+
+    async def _clear_cache(self, cache_dir: str) -> None:
+        """Clear a cache directory."""
+        try:
+            await self.storage_manager.remove_all(cache_dir)
+        except Exception as e:
+            logger.debug(f"Error clearing cache directory {cache_dir}: {e}")
+
+    async def _check_remote_content_freshness(
+        self, url: str, cache_dir: str
+    ) -> Tuple[bool, Optional[str]]:
+        """
+        Check if remote content is fresher than cached version using HTTP headers.
+
+        Returns:
+            Tuple of (is_fresh: bool, new_identifier: Optional[str])
+        """
+        try:
+            # Make a HEAD request to check headers without downloading
+            ssl_context = create_secure_ssl_context()
+            connector = aiohttp.TCPConnector(ssl=ssl_context)
+            async with aiohttp.ClientSession(connector=connector) as session:
+                async with session.head(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                    response.raise_for_status()
+
+                    # Try ETag first (most reliable)
+                    etag = response.headers.get("ETag", "").strip('"')
+                    last_modified = response.headers.get("Last-Modified", "")
+
+                    # Use ETag if available, otherwise Last-Modified
+                    remote_identifier = etag if etag else last_modified
+
+                    if not remote_identifier:
+                        logger.debug("No freshness headers available, cannot check for updates")
+                        return True, None  # Assume fresh if no headers
+
+                    # Check cached identifier
+                    identifier_file = f"{cache_dir}/content_id.txt"
+                    if await self.storage_manager.file_exists(identifier_file):
+                        async with self.storage_manager.open(identifier_file, "r") as f:
+                            cached_identifier = (await asyncio.to_thread(f.read)).strip()
+                        if cached_identifier == remote_identifier:
+                            logger.debug(f"Content is fresh (identifier: {remote_identifier[:20]}...)")
+                            return True, None
+                        else:
+                            logger.info(
+                                f"Content has changed (old: {cached_identifier[:20]}..., new: {remote_identifier[:20]}...)"
+                            )
+                            return False, remote_identifier
+                    else:
+                        # No cached identifier, treat as stale
+                        return False, remote_identifier
+
+        except Exception as e:
+            logger.debug(f"Could not check remote freshness: {e}")
+            return True, None  # Assume fresh if we can't check
+
+    async def download_and_extract_zip(
+        self, url: str, cache_subdir_name: str, version_or_hash: str, force: bool = False
+    ) -> str:
+        """
+        Download a zip file and extract it to cache directory with content freshness checking.
+
+        Args:
+            url: URL to download zip file from
+            cache_subdir_name: Name of the cache subdirectory
+            version_or_hash: Version string or content hash for cache validation
+            force: If True, re-download even if already cached
+
+        Returns:
+            Path to the cached directory
+        """
+        cache_dir = await self.get_cache_subdir(cache_subdir_name)
+
+        # Check if already cached and valid
+        if not force and await self._is_cache_valid(cache_dir, version_or_hash):
+            # Also check if remote content has changed
+            is_fresh, new_identifier = await self._check_remote_content_freshness(url, cache_dir)
+            if is_fresh:
+                logger.debug(f"Content already cached and fresh for version {version_or_hash}")
+                return cache_dir
+            else:
+                logger.info("Cached content is stale, updating...")
+
+        # Clear old cache if it exists
+        await self._clear_cache(cache_dir)
+
+        logger.info(f"Downloading content from {url}...")
+
+        # Download the zip file
+        zip_content = BytesIO()
+        etag = ""
+        last_modified = ""
+        ssl_context = create_secure_ssl_context()
+        connector = aiohttp.TCPConnector(ssl=ssl_context)
+        async with aiohttp.ClientSession(connector=connector) as session:
+            async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
+                response.raise_for_status()
+
+                # Extract headers before consuming response
+                etag = response.headers.get("ETag", "").strip('"')
+                last_modified = response.headers.get("Last-Modified", "")
+
+                # Read the response content
+                async for chunk in response.content.iter_chunked(8192):
+                    zip_content.write(chunk)
+        zip_content.seek(0)
+
+        # Extract the archive
+        await self.storage_manager.ensure_directory_exists(cache_dir)
+
+        # Extract files and store them using StorageManager
+        with zipfile.ZipFile(zip_content, "r") as zip_file:
+            for file_info in zip_file.infolist():
+                if file_info.is_dir():
+                    # Create directory
+                    dir_path = f"{cache_dir}/{file_info.filename}"
+                    await self.storage_manager.ensure_directory_exists(dir_path)
+                else:
+                    # Extract and store file
+                    file_data = zip_file.read(file_info.filename)
+                    file_path = f"{cache_dir}/{file_info.filename}"
+                    await self.storage_manager.store(file_path, BytesIO(file_data), overwrite=True)
+
+        # Write version info for future cache validation
+        version_file = f"{cache_dir}/version.txt"
+        await self.storage_manager.store(version_file, version_or_hash, overwrite=True)
+
+        # Store content identifier from response headers for freshness checking
+        content_identifier = etag if etag else last_modified
+
+        if content_identifier:
+            identifier_file = f"{cache_dir}/content_id.txt"
+            await self.storage_manager.store(identifier_file, content_identifier, overwrite=True)
+            logger.debug(f"Stored content identifier: {content_identifier[:20]}...")
+
+        logger.info("✓ Content downloaded and cached successfully!")
+        return cache_dir
+
+    async def file_exists(self, file_path: str) -> bool:
+        """Check if a file exists in cache storage."""
+        return await self.storage_manager.file_exists(file_path)
+
+    async def read_file(self, file_path: str, encoding: str = "utf-8"):
+        """Read a file from cache storage."""
+        return self.storage_manager.open(file_path, encoding=encoding)
+
+    async def list_files(self, directory_path: str):
+        """List files in a cache directory."""
+        try:
+            file_list = await self.storage_manager.list_files(directory_path)
+
+            # For S3 storage, convert relative paths to full S3 URLs
+            if self.storage_manager.storage.storage_path.startswith("s3://"):
+                full_paths = []
+                for file_path in file_list:
+                    full_s3_path = f"{self.storage_manager.storage.storage_path}/{file_path}"
+                    full_paths.append(full_s3_path)
+                return full_paths
+            else:
+                # For local storage, return absolute paths
+                storage_path = self.storage_manager.storage.storage_path
+                if not storage_path.startswith("/"):
+                    import os
+
+                    storage_path = os.path.abspath(storage_path)
+
+                full_paths = []
+                for file_path in file_list:
+                    if file_path.startswith("/"):
+                        full_paths.append(file_path)  # Already absolute
+                    else:
+                        full_paths.append(f"{storage_path}/{file_path}")
+                return full_paths
+
+        except Exception as e:
+            logger.debug(f"Error listing files in {directory_path}: {e}")
+            return []
+
+
+# Convenience functions that maintain API compatibility
+_cache_manager = None
+
+
+def get_cache_manager() -> StorageAwareCache:
+    """Get a singleton cache manager instance."""
+    global _cache_manager
+    if _cache_manager is None:
+        _cache_manager = StorageAwareCache()
+    return _cache_manager
+
+
+def generate_content_hash(url: str, additional_data: str = "") -> str:
+    """Generate a content hash from URL and optional additional data."""
+    content = f"{url}:{additional_data}"
+    return hashlib.md5(content.encode()).hexdigest()[:12]  # Short hash for readability
+
+
+# Async wrapper functions for backward compatibility
+async def delete_cache():
+    """Delete the Cognee cache directory."""
+    cache_manager = get_cache_manager()
+    await cache_manager.delete_cache()
+
+
+async def get_cognee_cache_dir() -> str:
+    """Get the base Cognee cache directory."""
+    cache_manager = get_cache_manager()
+    return await cache_manager.get_cache_dir()
+
+
+async def get_cache_subdir(name: str) -> str:
+    """Get a specific cache subdirectory."""
+    cache_manager = get_cache_manager()
+    return await cache_manager.get_cache_subdir(name)
+
+
+async def download_and_extract_zip(
+    url: str, cache_dir_name: str, version_or_hash: str, force: bool = False
+) -> str:
+    """Download a zip file and extract it to cache directory."""
+    cache_manager = get_cache_manager()
+    return await cache_manager.download_and_extract_zip(url, cache_dir_name, version_or_hash, force)


+async def get_tutorial_data_dir() -> str:
+    """Get the tutorial data cache directory."""
+    return await get_cache_subdir("tutorial_data")
+
+
+# Cache file operations
+async def cache_file_exists(file_path: str) -> bool:
+    """Check if a file exists in cache storage."""
+    cache_manager = get_cache_manager()
+    return await cache_manager.file_exists(file_path)
+
+
+async def read_cache_file(file_path: str, encoding: str = "utf-8"):
+    """Read a file from cache storage."""
+    cache_manager = get_cache_manager()
+    return await cache_manager.read_file(file_path, encoding)
+
+
+async def list_cache_files(directory_path: str):
+    """List files in a cache directory."""
+    cache_manager = get_cache_manager()
+    return await cache_manager.list_files(directory_path)
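A short sketch of how the module-level convenience wrappers fit together; the URL and subdirectory name are placeholders, while the calls follow the signatures shown above:

import asyncio

from cognee.shared.cache import (
    download_and_extract_zip,
    generate_content_hash,
    list_cache_files,
)


async def fetch_tutorial_assets() -> None:
    url = "https://example.com/tutorial_data.zip"  # placeholder archive URL
    # The content hash doubles as the cache version, so re-runs hit the cache
    # unless the remote ETag/Last-Modified headers indicate the archive changed.
    cache_dir = await download_and_extract_zip(url, "tutorial_data", generate_content_hash(url))
    print(await list_cache_files(cache_dir))


asyncio.run(fetch_tutorial_assets())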
cognee/shared/utils.py
CHANGED

@@ -1,6 +1,7 @@
 """This module contains utility functions for the cognee."""

 import os
+import ssl
 import requests
 from datetime import datetime, timezone
 import matplotlib.pyplot as plt

@@ -18,6 +19,17 @@ from cognee.infrastructure.databases.graph import get_graph_engine
 proxy_url = "https://test.prometh.ai"


+def create_secure_ssl_context() -> ssl.SSLContext:
+    """
+    Create a secure SSL context.
+
+    By default, use the system's certificate store.
+    If users report SSL issues, I'm keeping this open in case we need to switch to:
+    ssl.create_default_context(cafile=certifi.where())
+    """
+    return ssl.create_default_context()
+
+
 def get_entities(tagged_tokens):
     import nltk

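For context, a minimal sketch of how this helper is wired into an aiohttp client, mirroring the pattern the new cognee/shared/cache.py uses; the fetch function itself is illustrative:

import aiohttp

from cognee.shared.utils import create_secure_ssl_context


async def fetch(url: str) -> bytes:
    # System certificate store by default; certifi could be swapped in if SSL issues surface.
    connector = aiohttp.TCPConnector(ssl=create_secure_ssl_context())
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url) as response:
            response.raise_for_status()
            return await response.read()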
cognee/tasks/graph/extract_graph_from_data.py
CHANGED

@@ -3,8 +3,14 @@ from typing import Type, List, Optional
 from pydantic import BaseModel

 from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
 from cognee.tasks.storage.add_data_points import add_data_points
-from cognee.modules.ontology.
+from cognee.modules.ontology.ontology_config import Config
+from cognee.modules.ontology.get_default_ontology_resolver import (
+    get_default_ontology_resolver,
+    get_ontology_resolver_from_env,
+)
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.modules.graph.utils import (
     expand_with_nodes_and_edges,

@@ -24,9 +30,28 @@ async def integrate_chunk_graphs(
     data_chunks: list[DocumentChunk],
     chunk_graphs: list,
     graph_model: Type[BaseModel],
-
+    ontology_resolver: BaseOntologyResolver,
 ) -> List[DocumentChunk]:
-    """
+    """Integrate chunk graphs with ontology validation and store in databases.
+
+    This function processes document chunks and their associated knowledge graphs,
+    validates entities against an ontology resolver, and stores the integrated
+    data points and edges in the configured databases.
+
+    Args:
+        data_chunks: List of document chunks containing source data
+        chunk_graphs: List of knowledge graphs corresponding to each chunk
+        graph_model: Pydantic model class for graph data validation
+        ontology_resolver: Resolver for validating entities against ontology
+
+    Returns:
+        List of updated DocumentChunk objects with integrated data
+
+    Raises:
+        InvalidChunkGraphInputError: If input validation fails
+        InvalidGraphModelError: If graph model validation fails
+        InvalidOntologyAdapterError: If ontology resolver validation fails
+    """

     if not isinstance(data_chunks, list) or not isinstance(chunk_graphs, list):
         raise InvalidChunkGraphInputError("data_chunks and chunk_graphs must be lists.")

@@ -36,9 +61,9 @@ async def integrate_chunk_graphs(
     )
     if not isinstance(graph_model, type) or not issubclass(graph_model, BaseModel):
         raise InvalidGraphModelError(graph_model)
-    if
+    if ontology_resolver is None or not hasattr(ontology_resolver, "get_subgraph"):
         raise InvalidOntologyAdapterError(
-            type(
+            type(ontology_resolver).__name__ if ontology_resolver else "None"
         )

     graph_engine = await get_graph_engine()

@@ -55,7 +80,7 @@ async def integrate_chunk_graphs(
     )

     graph_nodes, graph_edges = expand_with_nodes_and_edges(
-        data_chunks, chunk_graphs,
+        data_chunks, chunk_graphs, ontology_resolver, existing_edges_map
     )

     if len(graph_nodes) > 0:

@@ -70,7 +95,7 @@ async def integrate_chunk_graphs(
 async def extract_graph_from_data(
     data_chunks: List[DocumentChunk],
     graph_model: Type[BaseModel],
-
+    config: Config = None,
     custom_prompt: Optional[str] = None,
 ) -> List[DocumentChunk]:
     """

@@ -101,6 +126,24 @@ async def extract_graph_from_data(
         if edge.source_node_id in valid_node_ids and edge.target_node_id in valid_node_ids
     ]

-
-
-
+    # Extract resolver from config if provided, otherwise get default
+    if config is None:
+        ontology_config = get_ontology_env_config()
+        if (
+            ontology_config.ontology_file_path
+            and ontology_config.ontology_resolver
+            and ontology_config.matching_strategy
+        ):
+            config: Config = {
+                "ontology_config": {
+                    "ontology_resolver": get_ontology_resolver_from_env(**ontology_config.to_dict())
+                }
+            }
+        else:
+            config: Config = {
+                "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
+            }
+
+    ontology_resolver = config["ontology_config"]["ontology_resolver"]
+
+    return await integrate_chunk_graphs(data_chunks, chunk_graphs, graph_model, ontology_resolver)
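A hedged sketch of passing a resolver explicitly instead of relying on the environment-based fallback above; the wrapper function is illustrative, while the imports follow the module paths shown in this diff:

from typing import List

from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver
from cognee.modules.ontology.ontology_config import Config
from cognee.shared.data_models import KnowledgeGraph
from cognee.tasks.graph.extract_graph_from_data import extract_graph_from_data


async def extract_with_default_resolver(chunks: List[DocumentChunk]) -> List[DocumentChunk]:
    # Same config shape the task builds for itself when config is None and no
    # ontology environment variables are set.
    config: Config = {
        "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
    }
    return await extract_graph_from_data(chunks, KnowledgeGraph, config=config)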
cognee/tasks/graph/extract_graph_from_data_v2.py
CHANGED

@@ -3,7 +3,7 @@ from typing import List

 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.modules.ontology.
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
 from cognee.tasks.graph.cascade_extract.utils.extract_nodes import extract_nodes
 from cognee.tasks.graph.cascade_extract.utils.extract_content_nodes_and_relationship_names import (
     extract_content_nodes_and_relationship_names,

@@ -17,9 +17,21 @@ from cognee.tasks.graph.extract_graph_from_data import integrate_chunk_graphs
 async def extract_graph_from_data(
     data_chunks: List[DocumentChunk],
     n_rounds: int = 2,
-    ontology_adapter:
+    ontology_adapter: BaseOntologyResolver = None,
 ) -> List[DocumentChunk]:
-    """Extract and update graph data from document chunks
+    """Extract and update graph data from document chunks using cascade extraction.
+
+    This function performs multi-step graph extraction from document chunks,
+    using cascade extraction techniques to build comprehensive knowledge graphs.
+
+    Args:
+        data_chunks: List of document chunks to process
+        n_rounds: Number of extraction rounds to perform (default: 2)
+        ontology_adapter: Resolver for validating entities against ontology
+
+    Returns:
+        List of updated DocumentChunk objects with extracted graph data
+    """
     chunk_nodes = await asyncio.gather(
         *[extract_nodes(chunk.text, n_rounds) for chunk in data_chunks]
     )

@@ -44,5 +56,5 @@ async def extract_graph_from_data(
         data_chunks=data_chunks,
         chunk_graphs=chunk_graphs,
         graph_model=KnowledgeGraph,
-        ontology_adapter=ontology_adapter
+        ontology_adapter=ontology_adapter,
     )
cognee/tasks/ingestion/save_data_item_to_storage.py
CHANGED

@@ -41,6 +41,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
            abs_path.is_file()
        except (OSError, ValueError):
            # In case file path is too long it's most likely not a relative path
+           abs_path = data_item
            logger.debug(f"Data item was too long to be a possible file path: {abs_path}")
            abs_path = Path("")

cognee/tasks/temporal_graph/models.py
CHANGED

@@ -3,12 +3,17 @@ from pydantic import BaseModel, Field


 class Timestamp(BaseModel):
-    year: int = Field(
-
-
-
-
-
+    year: int = Field(
+        ...,
+        ge=1,
+        le=9999,
+        description="Always required. If only a year is known, use it.",
+    )
+    month: int = Field(1, ge=1, le=12, description="If unknown, default to 1")
+    day: int = Field(1, ge=1, le=31, description="If unknown, default to 1")
+    hour: int = Field(0, ge=0, le=23, description="If unknown, default to 0")
+    minute: int = Field(0, ge=0, le=59, description="If unknown, default to 0")
+    second: int = Field(0, ge=0, le=59, description="If unknown, default to 0")


 class Interval(BaseModel):
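A quick sketch of what the constrained model accepts; the values are illustrative:

from pydantic import ValidationError

from cognee.tasks.temporal_graph.models import Timestamp

# Only the year is required; month and day default to 1, the time fields to 0.
t = Timestamp(year=1969)
print(t.month, t.day, t.hour, t.minute, t.second)  # 1 1 0 0 0

# The ge/le bounds reject out-of-range values.
try:
    Timestamp(year=2024, month=13)
except ValidationError as error:
    print("month=13 rejected:", error)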
cognee/tests/cli_tests/cli_unit_tests/test_cli_main.py
CHANGED

@@ -49,7 +49,7 @@ class TestCliMain:
     def test_main_no_command(self, mock_create_parser):
         """Test main function when no command is provided"""
         mock_parser = MagicMock()
-        mock_parser.parse_args.return_value = MagicMock(command=None)
+        mock_parser.parse_args.return_value = MagicMock(command=None, spec={})
         mock_create_parser.return_value = (mock_parser, {})

         result = main()

@@ -64,7 +64,7 @@ class TestCliMain:
         mock_command.execute.return_value = None

         mock_parser = MagicMock()
-        mock_args = MagicMock(command="test")
+        mock_args = MagicMock(command="test", spec={})
         mock_parser.parse_args.return_value = mock_args

         mock_create_parser.return_value = (mock_parser, {"test": mock_command})

@@ -84,7 +84,7 @@ class TestCliMain:
         mock_command.execute.side_effect = CliCommandException("Test error", error_code=2)

         mock_parser = MagicMock()
-        mock_args = MagicMock(command="test")
+        mock_args = MagicMock(command="test", spec={})
         mock_parser.parse_args.return_value = mock_args

         mock_create_parser.return_value = (mock_parser, {"test": mock_command})

@@ -103,7 +103,7 @@ class TestCliMain:
         mock_command.execute.side_effect = Exception("Generic error")

         mock_parser = MagicMock()
-        mock_args = MagicMock(command="test")
+        mock_args = MagicMock(command="test", spec={})
         mock_parser.parse_args.return_value = mock_args

         mock_create_parser.return_value = (mock_parser, {"test": mock_command})

@@ -126,7 +126,7 @@ class TestCliMain:
         mock_command.execute.side_effect = test_exception

         mock_parser = MagicMock()
-        mock_args = MagicMock(command="test")
+        mock_args = MagicMock(command="test", spec={})
         mock_parser.parse_args.return_value = mock_args

         mock_create_parser.return_value = (mock_parser, {"test": mock_command})