cognee 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. cognee/api/v1/cloud/routers/get_checks_router.py +1 -1
  2. cognee/api/v1/cognify/cognify.py +44 -7
  3. cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
  4. cognee/api/v1/notebooks/routers/get_notebooks_router.py +2 -1
  5. cognee/api/v1/prune/prune.py +2 -2
  6. cognee/api/v1/search/search.py +1 -1
  7. cognee/api/v1/sync/sync.py +16 -5
  8. cognee/base_config.py +19 -1
  9. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
  10. cognee/infrastructure/databases/graph/kuzu/remote_kuzu_adapter.py +4 -1
  11. cognee/infrastructure/databases/relational/ModelBase.py +2 -1
  12. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +2 -2
  13. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +2 -6
  14. cognee/infrastructure/databases/vector/config.py +1 -1
  15. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +6 -5
  16. cognee/infrastructure/files/storage/LocalFileStorage.py +50 -0
  17. cognee/infrastructure/files/storage/S3FileStorage.py +56 -9
  18. cognee/infrastructure/files/storage/StorageManager.py +18 -0
  19. cognee/infrastructure/files/utils/get_file_metadata.py +6 -1
  20. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +4 -2
  21. cognee/infrastructure/utils/run_async.py +9 -4
  22. cognee/infrastructure/utils/run_sync.py +4 -3
  23. cognee/modules/cloud/operations/check_api_key.py +4 -1
  24. cognee/modules/data/deletion/prune_system.py +5 -1
  25. cognee/modules/data/methods/create_authorized_dataset.py +9 -0
  26. cognee/modules/data/methods/get_authorized_dataset.py +1 -1
  27. cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
  28. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
  29. cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
  30. cognee/modules/notebooks/methods/create_notebook.py +34 -0
  31. cognee/modules/notebooks/methods/get_notebook.py +2 -2
  32. cognee/modules/notebooks/methods/get_notebooks.py +27 -1
  33. cognee/modules/notebooks/methods/update_notebook.py +0 -1
  34. cognee/modules/notebooks/models/Notebook.py +206 -1
  35. cognee/modules/notebooks/operations/run_in_local_sandbox.py +8 -5
  36. cognee/modules/observability/get_observe.py +14 -0
  37. cognee/modules/observability/observers.py +1 -0
  38. cognee/modules/ontology/base_ontology_resolver.py +42 -0
  39. cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
  40. cognee/modules/ontology/matching_strategies.py +53 -0
  41. cognee/modules/ontology/models.py +20 -0
  42. cognee/modules/ontology/ontology_config.py +24 -0
  43. cognee/modules/ontology/ontology_env_config.py +45 -0
  44. cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
  45. cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +13 -0
  46. cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +1 -1
  47. cognee/modules/pipelines/models/PipelineRunInfo.py +7 -2
  48. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +1 -1
  49. cognee/modules/retrieval/graph_completion_cot_retriever.py +1 -1
  50. cognee/modules/retrieval/graph_completion_retriever.py +1 -1
  51. cognee/modules/retrieval/temporal_retriever.py +3 -3
  52. cognee/modules/retrieval/user_qa_feedback.py +1 -1
  53. cognee/modules/search/methods/get_search_type_tools.py +7 -0
  54. cognee/modules/search/methods/search.py +12 -13
  55. cognee/modules/search/utils/prepare_search_result.py +31 -9
  56. cognee/modules/search/utils/transform_context_to_graph.py +1 -1
  57. cognee/modules/search/utils/transform_insights_to_graph.py +28 -0
  58. cognee/modules/users/methods/create_user.py +4 -24
  59. cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
  60. cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
  61. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +19 -2
  62. cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
  63. cognee/modules/users/permissions/methods/get_principal.py +9 -0
  64. cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
  65. cognee/modules/users/permissions/methods/get_role.py +10 -0
  66. cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
  67. cognee/modules/users/permissions/methods/get_tenant.py +9 -0
  68. cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
  69. cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
  70. cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
  71. cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
  72. cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
  73. cognee/modules/users/roles/methods/create_role.py +10 -0
  74. cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
  75. cognee/modules/users/tenants/methods/create_tenant.py +10 -0
  76. cognee/root_dir.py +5 -0
  77. cognee/shared/cache.py +346 -0
  78. cognee/shared/utils.py +12 -0
  79. cognee/tasks/graph/extract_graph_from_data.py +53 -10
  80. cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
  81. cognee/tasks/ingestion/save_data_item_to_storage.py +1 -0
  82. cognee/tasks/temporal_graph/models.py +11 -6
  83. cognee/tests/cli_tests/cli_unit_tests/test_cli_main.py +5 -5
  84. cognee/tests/test_cognee_server_start.py +4 -4
  85. cognee/tests/test_temporal_graph.py +6 -34
  86. cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
  87. cognee/tests/unit/modules/users/test_tutorial_notebook_creation.py +399 -0
  88. {cognee-0.3.2.dist-info → cognee-0.3.4.dist-info}/METADATA +11 -8
  89. {cognee-0.3.2.dist-info → cognee-0.3.4.dist-info}/RECORD +93 -86
  90. cognee-0.3.4.dist-info/entry_points.txt +2 -0
  91. cognee/api/v1/save/save.py +0 -335
  92. cognee/tests/test_save_export_path.py +0 -116
  93. cognee-0.3.2.dist-info/entry_points.txt +0 -2
  94. {cognee-0.3.2.dist-info → cognee-0.3.4.dist-info}/WHEEL +0 -0
  95. {cognee-0.3.2.dist-info → cognee-0.3.4.dist-info}/licenses/LICENSE +0 -0
  96. {cognee-0.3.2.dist-info → cognee-0.3.4.dist-info}/licenses/NOTICE.md +0 -0
@@ -8,6 +8,16 @@ from ...models import ACL, Permission
8
8
 
9
9
 
10
10
  async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -> list[str]:
11
+ """
12
 + Return a list of document ids for which the user has read permission.
13
+ If datasets are specified, return only documents from those datasets.
14
+ Args:
15
+ user_id: Id of the user
16
+ datasets: List of datasets
17
+
18
+ Returns:
19
+ list[str]: List of documents for which the user has read permission
20
+ """
11
21
  db_engine = get_relational_engine()
12
22
 
13
23
  async with db_engine.get_async_session() as session:
@@ -6,6 +6,15 @@ from ...models.Principal import Principal
6
6
 
7
7
 
8
8
  async def get_principal(principal_id: UUID):
9
+ """
10
+ Return information about a user based on their id
11
+ Args:
12
+ principal_id: Id of the user
13
+
14
+ Returns:
15
+ principal: Information about the user (principal)
16
+
17
+ """
9
18
  db_engine = get_relational_engine()
10
19
 
11
20
  async with db_engine.get_async_session() as session:
@@ -9,6 +9,17 @@ from ...models.ACL import ACL
9
9
 
10
10
 
11
11
  async def get_principal_datasets(principal: Principal, permission_type: str) -> list[Dataset]:
12
+ """
13
+ Return a list of datasets for which the user (principal) has a certain permission.
14
+ Args:
15
+ principal: Information about the user
16
+ permission_type: Type of permission
17
+
18
+ Returns:
19
+ list[Dataset]: List of datasets for which the user (principal)
20
+ has the permission (permission_type).
21
+
22
+ """
12
23
  db_engine = get_relational_engine()
13
24
 
14
25
  async with db_engine.get_async_session() as session:
@@ -9,6 +9,16 @@ from ...models.Role import Role
9
9
 
10
10
 
11
11
  async def get_role(tenant_id: UUID, role_name: str):
12
+ """
13
+ Return the role with the name role_name of the given tenant.
14
+ Args:
15
+ tenant_id: Id of the given tenant
16
+ role_name: Name of the role
17
+
18
 + Returns:
19
+ The role for the given tenant.
20
+
21
+ """
12
22
  db_engine = get_relational_engine()
13
23
 
14
24
  async with db_engine.get_async_session() as session:
@@ -15,9 +15,9 @@ async def get_specific_user_permission_datasets(
15
15
  Return a list of datasets user has given permission for. If a list of datasets is provided,
16
16
  verify for which datasets user has appropriate permission for and return list of datasets he has permission for.
17
17
  Args:
18
- user_id:
19
- permission_type:
20
- dataset_ids:
18
+ user_id: Id of the user.
19
+ permission_type: Type of the permission.
20
+ dataset_ids: Ids of the provided datasets
21
21
 
22
22
  Returns:
23
23
  list[Dataset]: List of datasets user has permission for
@@ -8,6 +8,15 @@ from ...models.Tenant import Tenant
8
8
 
9
9
 
10
10
  async def get_tenant(tenant_id: UUID):
11
+ """
12
+ Return information about the tenant based on the given id.
13
+ Args:
14
+ tenant_id: Id of the given tenant
15
+
16
 + Returns:
17
+ Information about the given tenant.
18
+
19
+ """
11
20
  db_engine = get_relational_engine()
12
21
 
13
22
  async with db_engine.get_async_session() as session:
@@ -16,6 +16,15 @@ from cognee.modules.users.models import (
16
16
 
17
17
 
18
18
  async def give_default_permission_to_role(role_id: UUID, permission_name: str):
19
+ """
20
+ Give the permission with given name to the role with the given id as a default permission.
21
+ Args:
22
+ role_id: Id of the role
23
+ permission_name: Name of the permission
24
+
25
+ Returns:
26
+ None
27
+ """
19
28
  db_engine = get_relational_engine()
20
29
 
21
30
  async with db_engine.get_async_session() as session:
@@ -16,6 +16,15 @@ from cognee.modules.users.models import (
16
16
 
17
17
 
18
18
  async def give_default_permission_to_tenant(tenant_id: UUID, permission_name: str):
19
+ """
20
+ Give the permission with given name to the tenant with the given id as a default permission.
21
+ Args:
22
+ tenant_id: Id of the tenant
23
+ permission_name: Name of the permission
24
+
25
+ Returns:
26
+ None
27
+ """
19
28
  db_engine = get_relational_engine()
20
29
  async with db_engine.get_async_session() as session:
21
30
  tenant = (
@@ -16,6 +16,15 @@ from cognee.modules.users.models import (
16
16
 
17
17
 
18
18
  async def give_default_permission_to_user(user_id: UUID, permission_name: str):
19
+ """
20
+ Give the permission with given name to the user with the given id as a default permission.
21
+ Args:
22
 + user_id: Id of the user
23
+ permission_name: Name of the permission
24
+
25
+ Returns:
26
+ None
27
+ """
19
28
  db_engine = get_relational_engine()
20
29
  async with db_engine.get_async_session() as session:
21
30
  user = (await session.execute(select(User).where(User.id == user_id))).scalars().first()
@@ -24,6 +24,16 @@ async def give_permission_on_dataset(
24
24
  dataset_id: UUID,
25
25
  permission_name: str,
26
26
  ):
27
+ """
28
+ Give a specific permission on a dataset to a user.
29
+ Args:
30
+ principal: User who is being given the permission on the dataset
31
+ dataset_id: Id of the dataset
32
+ permission_name: Name of permission to give
33
+
34
+ Returns:
35
+ None
36
+ """
27
37
  db_engine = get_relational_engine()
28
38
 
29
39
  async with db_engine.get_async_session() as session:
@@ -21,6 +21,17 @@ from cognee.modules.users.models import (
21
21
 
22
22
 
23
23
  async def add_user_to_role(user_id: UUID, role_id: UUID, owner_id: UUID):
24
+ """
25
+ Add a user with the given id to the role with the given id.
26
+ Args:
27
+ user_id: Id of the user.
28
+ role_id: Id of the role.
29
+ owner_id: Id of the request owner.
30
+
31
+ Returns:
32
+ None
33
+
34
+ """
24
35
  db_engine = get_relational_engine()
25
36
  async with db_engine.get_async_session() as session:
26
37
  user = (await session.execute(select(User).where(User.id == user_id))).scalars().first()
@@ -16,6 +16,16 @@ async def create_role(
16
16
  role_name: str,
17
17
  owner_id: UUID,
18
18
  ):
19
+ """
20
+ Create a new role with the given name, if the request owner with the given id
21
+ has the necessary permission.
22
+ Args:
23
+ role_name: Name of the new role.
24
+ owner_id: Id of the request owner.
25
+
26
+ Returns:
27
+ None
28
+ """
19
29
  db_engine = get_relational_engine()
20
30
  async with db_engine.get_async_session() as session:
21
31
  user = await get_user(owner_id)
@@ -13,6 +13,18 @@ from cognee.modules.users.exceptions import (
13
13
 
14
14
 
15
15
  async def add_user_to_tenant(user_id: UUID, tenant_id: UUID, owner_id: UUID):
16
+ """
17
+ Add a user with the given id to the tenant with the given id.
18
+ This can only be successful if the request owner with the given id is the tenant owner.
19
+ Args:
20
+ user_id: Id of the user.
21
+ tenant_id: Id of the tenant.
22
+ owner_id: Id of the request owner.
23
+
24
+ Returns:
25
+ None
26
+
27
+ """
16
28
  db_engine = get_relational_engine()
17
29
  async with db_engine.get_async_session() as session:
18
30
  user = await get_user(user_id)
@@ -8,6 +8,16 @@ from cognee.modules.users.methods import get_user
8
8
 
9
9
 
10
10
  async def create_tenant(tenant_name: str, user_id: UUID):
11
+ """
12
+ Create a new tenant with the given name, for the user with the given id.
13
+ This user is the owner of the tenant.
14
+ Args:
15
+ tenant_name: Name of the new tenant.
16
+ user_id: Id of the user.
17
+
18
+ Returns:
19
+ None
20
+ """
11
21
  db_engine = get_relational_engine()
12
22
  async with db_engine.get_async_session() as session:
13
23
  try:
cognee/root_dir.py CHANGED
@@ -20,6 +20,11 @@ def ensure_absolute_path(path: str) -> str:
20
20
  """
21
21
  if path is None:
22
22
  raise ValueError("Path cannot be None")
23
+
24
+ # Check if it's an S3 URL - S3 URLs are absolute by definition
25
+ if path.startswith("s3://"):
26
+ return path
27
+
23
28
  path_obj = Path(path).expanduser()
24
29
  if path_obj.is_absolute():
25
30
  return str(path_obj.resolve())
cognee/shared/cache.py ADDED
@@ -0,0 +1,346 @@
1
+ """
2
+ Storage-aware cache management utilities for Cognee.
3
+
4
+ This module provides cache functionality that works with both local and cloud storage
5
+ backends (like S3) through the StorageManager abstraction.
6
+ """
7
+
8
+ import hashlib
9
+ import zipfile
10
+ import asyncio
11
+ from typing import Optional, Tuple
12
+ import aiohttp
13
+ import logging
14
+ from io import BytesIO
15
+
16
+ from cognee.base_config import get_base_config
17
+ from cognee.infrastructure.files.storage.get_file_storage import get_file_storage
18
+ from cognee.infrastructure.files.storage.StorageManager import StorageManager
19
+ from cognee.shared.utils import create_secure_ssl_context
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class StorageAwareCache:
25
+ """
26
+ A cache manager that works with different storage backends (local, S3, etc.)
27
+ """
28
+
29
+ def __init__(self, cache_subdir: str = "cache"):
30
+ """
31
+ Initialize the cache manager.
32
+
33
+ Args:
34
+ cache_subdir: Subdirectory name within the system root for caching
35
+ """
36
+ self.base_config = get_base_config()
37
+ # Since we're using cache_root_directory, don't add extra cache prefix
38
+ self.cache_base_path = ""
39
+ self.storage_manager: StorageManager = get_file_storage(
40
+ self.base_config.cache_root_directory
41
+ )
42
+
43
+ # Print absolute path
44
+ storage_path = self.storage_manager.storage.storage_path
45
+ if storage_path.startswith("s3://"):
46
+ absolute_path = storage_path # S3 paths are already absolute
47
+ else:
48
+ import os
49
+
50
+ absolute_path = os.path.abspath(storage_path)
51
+ logger.info(f"Storage manager absolute path: {absolute_path}")
52
+
53
+ async def get_cache_dir(self) -> str:
54
+ """Get the base cache directory path."""
55
+ cache_path = self.cache_base_path or "." # Use "." for root when cache_base_path is empty
56
+ await self.storage_manager.ensure_directory_exists(cache_path)
57
+ return cache_path
58
+
59
+ async def get_cache_subdir(self, name: str) -> str:
60
+ """Get a specific cache subdirectory."""
61
+ if self.cache_base_path:
62
+ cache_path = f"{self.cache_base_path}/{name}"
63
+ else:
64
+ cache_path = name
65
+ await self.storage_manager.ensure_directory_exists(cache_path)
66
+
67
+ # Return the absolute path based on storage system
68
+ if self.storage_manager.storage.storage_path.startswith("s3://"):
69
+ return cache_path
70
+ elif hasattr(self.storage_manager.storage, "storage_path"):
71
+ return f"{self.storage_manager.storage.storage_path}/{cache_path}"
72
+ else:
73
+ # Fallback for other storage types
74
+ return cache_path
75
+
76
+ async def delete_cache(self):
77
+ """Delete the entire cache directory."""
78
+ logger.info("Deleting cache...")
79
+ try:
80
+ await self.storage_manager.remove_all(self.cache_base_path)
81
+ logger.info("✓ Cache deleted successfully!")
82
+ except Exception as e:
83
+ logger.error(f"Error deleting cache: {e}")
84
+ raise
85
+
86
+ async def _is_cache_valid(self, cache_dir: str, version_or_hash: str) -> bool:
87
+ """Check if cached content is valid for the given version/hash."""
88
+ version_file = f"{cache_dir}/version.txt"
89
+
90
+ if not await self.storage_manager.file_exists(version_file):
91
+ return False
92
+
93
+ try:
94
+ async with self.storage_manager.open(version_file, "r") as f:
95
+ cached_version = (await asyncio.to_thread(f.read)).strip()
96
+ return cached_version == version_or_hash
97
+ except Exception as e:
98
+ logger.debug(f"Error checking cache validity: {e}")
99
+ return False
100
+
101
+ async def _clear_cache(self, cache_dir: str) -> None:
102
+ """Clear a cache directory."""
103
+ try:
104
+ await self.storage_manager.remove_all(cache_dir)
105
+ except Exception as e:
106
+ logger.debug(f"Error clearing cache directory {cache_dir}: {e}")
107
+
108
+ async def _check_remote_content_freshness(
109
+ self, url: str, cache_dir: str
110
+ ) -> Tuple[bool, Optional[str]]:
111
+ """
112
+ Check if remote content is fresher than cached version using HTTP headers.
113
+
114
+ Returns:
115
+ Tuple of (is_fresh: bool, new_identifier: Optional[str])
116
+ """
117
+ try:
118
+ # Make a HEAD request to check headers without downloading
119
+ ssl_context = create_secure_ssl_context()
120
+ connector = aiohttp.TCPConnector(ssl=ssl_context)
121
+ async with aiohttp.ClientSession(connector=connector) as session:
122
+ async with session.head(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
123
+ response.raise_for_status()
124
+
125
+ # Try ETag first (most reliable)
126
+ etag = response.headers.get("ETag", "").strip('"')
127
+ last_modified = response.headers.get("Last-Modified", "")
128
+
129
+ # Use ETag if available, otherwise Last-Modified
130
+ remote_identifier = etag if etag else last_modified
131
+
132
+ if not remote_identifier:
133
+ logger.debug("No freshness headers available, cannot check for updates")
134
+ return True, None # Assume fresh if no headers
135
+
136
+ # Check cached identifier
137
+ identifier_file = f"{cache_dir}/content_id.txt"
138
+ if await self.storage_manager.file_exists(identifier_file):
139
+ async with self.storage_manager.open(identifier_file, "r") as f:
140
+ cached_identifier = (await asyncio.to_thread(f.read)).strip()
141
+ if cached_identifier == remote_identifier:
142
+ logger.debug(f"Content is fresh (identifier: {remote_identifier[:20]}...)")
143
+ return True, None
144
+ else:
145
+ logger.info(
146
+ f"Content has changed (old: {cached_identifier[:20]}..., new: {remote_identifier[:20]}...)"
147
+ )
148
+ return False, remote_identifier
149
+ else:
150
+ # No cached identifier, treat as stale
151
+ return False, remote_identifier
152
+
153
+ except Exception as e:
154
+ logger.debug(f"Could not check remote freshness: {e}")
155
+ return True, None # Assume fresh if we can't check
156
+
157
+ async def download_and_extract_zip(
158
+ self, url: str, cache_subdir_name: str, version_or_hash: str, force: bool = False
159
+ ) -> str:
160
+ """
161
+ Download a zip file and extract it to cache directory with content freshness checking.
162
+
163
+ Args:
164
+ url: URL to download zip file from
165
+ cache_subdir_name: Name of the cache subdirectory
166
+ version_or_hash: Version string or content hash for cache validation
167
+ force: If True, re-download even if already cached
168
+
169
+ Returns:
170
+ Path to the cached directory
171
+ """
172
+ cache_dir = await self.get_cache_subdir(cache_subdir_name)
173
+
174
+ # Check if already cached and valid
175
+ if not force and await self._is_cache_valid(cache_dir, version_or_hash):
176
+ # Also check if remote content has changed
177
+ is_fresh, new_identifier = await self._check_remote_content_freshness(url, cache_dir)
178
+ if is_fresh:
179
+ logger.debug(f"Content already cached and fresh for version {version_or_hash}")
180
+ return cache_dir
181
+ else:
182
+ logger.info("Cached content is stale, updating...")
183
+
184
+ # Clear old cache if it exists
185
+ await self._clear_cache(cache_dir)
186
+
187
+ logger.info(f"Downloading content from {url}...")
188
+
189
+ # Download the zip file
190
+ zip_content = BytesIO()
191
+ etag = ""
192
+ last_modified = ""
193
+ ssl_context = create_secure_ssl_context()
194
+ connector = aiohttp.TCPConnector(ssl=ssl_context)
195
+ async with aiohttp.ClientSession(connector=connector) as session:
196
+ async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
197
+ response.raise_for_status()
198
+
199
+ # Extract headers before consuming response
200
+ etag = response.headers.get("ETag", "").strip('"')
201
+ last_modified = response.headers.get("Last-Modified", "")
202
+
203
+ # Read the response content
204
+ async for chunk in response.content.iter_chunked(8192):
205
+ zip_content.write(chunk)
206
+ zip_content.seek(0)
207
+
208
+ # Extract the archive
209
+ await self.storage_manager.ensure_directory_exists(cache_dir)
210
+
211
+ # Extract files and store them using StorageManager
212
+ with zipfile.ZipFile(zip_content, "r") as zip_file:
213
+ for file_info in zip_file.infolist():
214
+ if file_info.is_dir():
215
+ # Create directory
216
+ dir_path = f"{cache_dir}/{file_info.filename}"
217
+ await self.storage_manager.ensure_directory_exists(dir_path)
218
+ else:
219
+ # Extract and store file
220
+ file_data = zip_file.read(file_info.filename)
221
+ file_path = f"{cache_dir}/{file_info.filename}"
222
+ await self.storage_manager.store(file_path, BytesIO(file_data), overwrite=True)
223
+
224
+ # Write version info for future cache validation
225
+ version_file = f"{cache_dir}/version.txt"
226
+ await self.storage_manager.store(version_file, version_or_hash, overwrite=True)
227
+
228
+ # Store content identifier from response headers for freshness checking
229
+ content_identifier = etag if etag else last_modified
230
+
231
+ if content_identifier:
232
+ identifier_file = f"{cache_dir}/content_id.txt"
233
+ await self.storage_manager.store(identifier_file, content_identifier, overwrite=True)
234
+ logger.debug(f"Stored content identifier: {content_identifier[:20]}...")
235
+
236
+ logger.info("✓ Content downloaded and cached successfully!")
237
+ return cache_dir
238
+
239
+ async def file_exists(self, file_path: str) -> bool:
240
+ """Check if a file exists in cache storage."""
241
+ return await self.storage_manager.file_exists(file_path)
242
+
243
+ async def read_file(self, file_path: str, encoding: str = "utf-8"):
244
+ """Read a file from cache storage."""
245
+ return self.storage_manager.open(file_path, encoding=encoding)
246
+
247
+ async def list_files(self, directory_path: str):
248
+ """List files in a cache directory."""
249
+ try:
250
+ file_list = await self.storage_manager.list_files(directory_path)
251
+
252
+ # For S3 storage, convert relative paths to full S3 URLs
253
+ if self.storage_manager.storage.storage_path.startswith("s3://"):
254
+ full_paths = []
255
+ for file_path in file_list:
256
+ full_s3_path = f"{self.storage_manager.storage.storage_path}/{file_path}"
257
+ full_paths.append(full_s3_path)
258
+ return full_paths
259
+ else:
260
+ # For local storage, return absolute paths
261
+ storage_path = self.storage_manager.storage.storage_path
262
+ if not storage_path.startswith("/"):
263
+ import os
264
+
265
+ storage_path = os.path.abspath(storage_path)
266
+
267
+ full_paths = []
268
+ for file_path in file_list:
269
+ if file_path.startswith("/"):
270
+ full_paths.append(file_path) # Already absolute
271
+ else:
272
+ full_paths.append(f"{storage_path}/{file_path}")
273
+ return full_paths
274
+
275
+ except Exception as e:
276
+ logger.debug(f"Error listing files in {directory_path}: {e}")
277
+ return []
278
+
279
+
280
+ # Convenience functions that maintain API compatibility
281
+ _cache_manager = None
282
+
283
+
284
+ def get_cache_manager() -> StorageAwareCache:
285
+ """Get a singleton cache manager instance."""
286
+ global _cache_manager
287
+ if _cache_manager is None:
288
+ _cache_manager = StorageAwareCache()
289
+ return _cache_manager
290
+
291
+
292
+ def generate_content_hash(url: str, additional_data: str = "") -> str:
293
+ """Generate a content hash from URL and optional additional data."""
294
+ content = f"{url}:{additional_data}"
295
+ return hashlib.md5(content.encode()).hexdigest()[:12] # Short hash for readability
296
+
297
+
298
+ # Async wrapper functions for backward compatibility
299
+ async def delete_cache():
300
+ """Delete the Cognee cache directory."""
301
+ cache_manager = get_cache_manager()
302
+ await cache_manager.delete_cache()
303
+
304
+
305
+ async def get_cognee_cache_dir() -> str:
306
+ """Get the base Cognee cache directory."""
307
+ cache_manager = get_cache_manager()
308
+ return await cache_manager.get_cache_dir()
309
+
310
+
311
+ async def get_cache_subdir(name: str) -> str:
312
+ """Get a specific cache subdirectory."""
313
+ cache_manager = get_cache_manager()
314
+ return await cache_manager.get_cache_subdir(name)
315
+
316
+
317
+ async def download_and_extract_zip(
318
+ url: str, cache_dir_name: str, version_or_hash: str, force: bool = False
319
+ ) -> str:
320
+ """Download a zip file and extract it to cache directory."""
321
+ cache_manager = get_cache_manager()
322
+ return await cache_manager.download_and_extract_zip(url, cache_dir_name, version_or_hash, force)
323
+
324
+
325
+ async def get_tutorial_data_dir() -> str:
326
+ """Get the tutorial data cache directory."""
327
+ return await get_cache_subdir("tutorial_data")
328
+
329
+
330
+ # Cache file operations
331
+ async def cache_file_exists(file_path: str) -> bool:
332
+ """Check if a file exists in cache storage."""
333
+ cache_manager = get_cache_manager()
334
+ return await cache_manager.file_exists(file_path)
335
+
336
+
337
+ async def read_cache_file(file_path: str, encoding: str = "utf-8"):
338
+ """Read a file from cache storage."""
339
+ cache_manager = get_cache_manager()
340
+ return await cache_manager.read_file(file_path, encoding)
341
+
342
+
343
+ async def list_cache_files(directory_path: str):
344
+ """List files in a cache directory."""
345
+ cache_manager = get_cache_manager()
346
+ return await cache_manager.list_files(directory_path)
cognee/shared/utils.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """This module contains utility functions for the cognee."""
2
2
 
3
3
  import os
4
+ import ssl
4
5
  import requests
5
6
  from datetime import datetime, timezone
6
7
  import matplotlib.pyplot as plt
@@ -18,6 +19,17 @@ from cognee.infrastructure.databases.graph import get_graph_engine
18
19
  proxy_url = "https://test.prometh.ai"
19
20
 
20
21
 
22
+ def create_secure_ssl_context() -> ssl.SSLContext:
23
+ """
24
+ Create a secure SSL context.
25
+
26
+ By default, use the system's certificate store.
27
+ If users report SSL issues, I'm keeping this open in case we need to switch to:
28
+ ssl.create_default_context(cafile=certifi.where())
29
+ """
30
+ return ssl.create_default_context()
31
+
32
+
21
33
  def get_entities(tagged_tokens):
22
34
  import nltk
23
35