cognee 0.2.4__py3-none-any.whl → 0.3.0.dev0__py3-none-any.whl

This diff shows the changes between two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (163)
  1. cognee/__init__.py +1 -0
  2. cognee/api/client.py +28 -3
  3. cognee/api/health.py +10 -13
  4. cognee/api/v1/add/add.py +3 -1
  5. cognee/api/v1/add/routers/get_add_router.py +12 -37
  6. cognee/api/v1/cloud/routers/__init__.py +1 -0
  7. cognee/api/v1/cloud/routers/get_checks_router.py +23 -0
  8. cognee/api/v1/cognify/code_graph_pipeline.py +9 -4
  9. cognee/api/v1/cognify/cognify.py +50 -3
  10. cognee/api/v1/cognify/routers/get_cognify_router.py +1 -1
  11. cognee/api/v1/datasets/routers/get_datasets_router.py +15 -4
  12. cognee/api/v1/memify/__init__.py +0 -0
  13. cognee/api/v1/memify/routers/__init__.py +1 -0
  14. cognee/api/v1/memify/routers/get_memify_router.py +100 -0
  15. cognee/api/v1/notebooks/routers/__init__.py +1 -0
  16. cognee/api/v1/notebooks/routers/get_notebooks_router.py +96 -0
  17. cognee/api/v1/search/routers/get_search_router.py +20 -1
  18. cognee/api/v1/search/search.py +11 -4
  19. cognee/api/v1/sync/__init__.py +17 -0
  20. cognee/api/v1/sync/routers/__init__.py +3 -0
  21. cognee/api/v1/sync/routers/get_sync_router.py +241 -0
  22. cognee/api/v1/sync/sync.py +877 -0
  23. cognee/api/v1/users/routers/get_auth_router.py +13 -1
  24. cognee/base_config.py +10 -1
  25. cognee/infrastructure/databases/graph/config.py +10 -4
  26. cognee/infrastructure/databases/graph/kuzu/adapter.py +135 -0
  27. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +89 -0
  28. cognee/infrastructure/databases/relational/__init__.py +2 -0
  29. cognee/infrastructure/databases/relational/get_async_session.py +15 -0
  30. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +6 -1
  31. cognee/infrastructure/databases/relational/with_async_session.py +25 -0
  32. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +1 -1
  33. cognee/infrastructure/databases/vector/config.py +13 -6
  34. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +1 -1
  35. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +2 -6
  36. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +4 -1
  37. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -0
  38. cognee/infrastructure/files/storage/S3FileStorage.py +5 -0
  39. cognee/infrastructure/files/storage/StorageManager.py +7 -1
  40. cognee/infrastructure/files/storage/storage.py +16 -0
  41. cognee/infrastructure/llm/LLMGateway.py +18 -0
  42. cognee/infrastructure/llm/config.py +4 -2
  43. cognee/infrastructure/llm/prompts/extract_query_time.txt +15 -0
  44. cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +25 -0
  45. cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +30 -0
  46. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +2 -0
  47. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py +44 -0
  48. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py +1 -0
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py +46 -0
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +25 -1
  51. cognee/infrastructure/utils/run_sync.py +8 -1
  52. cognee/modules/chunking/models/DocumentChunk.py +4 -3
  53. cognee/modules/cloud/exceptions/CloudApiKeyMissingError.py +15 -0
  54. cognee/modules/cloud/exceptions/CloudConnectionError.py +15 -0
  55. cognee/modules/cloud/exceptions/__init__.py +2 -0
  56. cognee/modules/cloud/operations/__init__.py +1 -0
  57. cognee/modules/cloud/operations/check_api_key.py +25 -0
  58. cognee/modules/data/deletion/prune_system.py +1 -1
  59. cognee/modules/data/methods/check_dataset_name.py +1 -1
  60. cognee/modules/data/methods/get_dataset_data.py +1 -1
  61. cognee/modules/data/methods/load_or_create_datasets.py +1 -1
  62. cognee/modules/engine/models/Event.py +16 -0
  63. cognee/modules/engine/models/Interval.py +8 -0
  64. cognee/modules/engine/models/Timestamp.py +13 -0
  65. cognee/modules/engine/models/__init__.py +3 -0
  66. cognee/modules/engine/utils/__init__.py +2 -0
  67. cognee/modules/engine/utils/generate_event_datapoint.py +46 -0
  68. cognee/modules/engine/utils/generate_timestamp_datapoint.py +51 -0
  69. cognee/modules/graph/cognee_graph/CogneeGraph.py +2 -2
  70. cognee/modules/graph/utils/__init__.py +1 -0
  71. cognee/modules/graph/utils/resolve_edges_to_text.py +71 -0
  72. cognee/modules/memify/__init__.py +1 -0
  73. cognee/modules/memify/memify.py +118 -0
  74. cognee/modules/notebooks/methods/__init__.py +5 -0
  75. cognee/modules/notebooks/methods/create_notebook.py +26 -0
  76. cognee/modules/notebooks/methods/delete_notebook.py +13 -0
  77. cognee/modules/notebooks/methods/get_notebook.py +21 -0
  78. cognee/modules/notebooks/methods/get_notebooks.py +18 -0
  79. cognee/modules/notebooks/methods/update_notebook.py +17 -0
  80. cognee/modules/notebooks/models/Notebook.py +53 -0
  81. cognee/modules/notebooks/models/__init__.py +1 -0
  82. cognee/modules/notebooks/operations/__init__.py +1 -0
  83. cognee/modules/notebooks/operations/run_in_local_sandbox.py +55 -0
  84. cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +19 -3
  85. cognee/modules/pipelines/operations/pipeline.py +1 -0
  86. cognee/modules/pipelines/operations/run_tasks.py +17 -41
  87. cognee/modules/retrieval/base_graph_retriever.py +18 -0
  88. cognee/modules/retrieval/base_retriever.py +1 -1
  89. cognee/modules/retrieval/code_retriever.py +8 -0
  90. cognee/modules/retrieval/coding_rules_retriever.py +31 -0
  91. cognee/modules/retrieval/completion_retriever.py +9 -3
  92. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -0
  93. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +23 -14
  94. cognee/modules/retrieval/graph_completion_cot_retriever.py +21 -11
  95. cognee/modules/retrieval/graph_completion_retriever.py +32 -65
  96. cognee/modules/retrieval/graph_summary_completion_retriever.py +3 -1
  97. cognee/modules/retrieval/insights_retriever.py +14 -3
  98. cognee/modules/retrieval/summaries_retriever.py +1 -1
  99. cognee/modules/retrieval/temporal_retriever.py +152 -0
  100. cognee/modules/retrieval/utils/brute_force_triplet_search.py +7 -32
  101. cognee/modules/retrieval/utils/completion.py +10 -3
  102. cognee/modules/search/methods/get_search_type_tools.py +168 -0
  103. cognee/modules/search/methods/no_access_control_search.py +47 -0
  104. cognee/modules/search/methods/search.py +219 -139
  105. cognee/modules/search/types/SearchResult.py +21 -0
  106. cognee/modules/search/types/SearchType.py +2 -0
  107. cognee/modules/search/types/__init__.py +1 -0
  108. cognee/modules/search/utils/__init__.py +2 -0
  109. cognee/modules/search/utils/prepare_search_result.py +41 -0
  110. cognee/modules/search/utils/transform_context_to_graph.py +38 -0
  111. cognee/modules/sync/__init__.py +1 -0
  112. cognee/modules/sync/methods/__init__.py +23 -0
  113. cognee/modules/sync/methods/create_sync_operation.py +53 -0
  114. cognee/modules/sync/methods/get_sync_operation.py +107 -0
  115. cognee/modules/sync/methods/update_sync_operation.py +248 -0
  116. cognee/modules/sync/models/SyncOperation.py +142 -0
  117. cognee/modules/sync/models/__init__.py +3 -0
  118. cognee/modules/users/__init__.py +0 -1
  119. cognee/modules/users/methods/__init__.py +4 -1
  120. cognee/modules/users/methods/create_user.py +26 -1
  121. cognee/modules/users/methods/get_authenticated_user.py +36 -42
  122. cognee/modules/users/methods/get_default_user.py +3 -1
  123. cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +2 -1
  124. cognee/root_dir.py +19 -0
  125. cognee/shared/logging_utils.py +1 -1
  126. cognee/tasks/codingagents/__init__.py +0 -0
  127. cognee/tasks/codingagents/coding_rule_associations.py +127 -0
  128. cognee/tasks/ingestion/save_data_item_to_storage.py +23 -0
  129. cognee/tasks/memify/__init__.py +2 -0
  130. cognee/tasks/memify/extract_subgraph.py +7 -0
  131. cognee/tasks/memify/extract_subgraph_chunks.py +11 -0
  132. cognee/tasks/repo_processor/get_repo_file_dependencies.py +52 -27
  133. cognee/tasks/temporal_graph/__init__.py +1 -0
  134. cognee/tasks/temporal_graph/add_entities_to_event.py +85 -0
  135. cognee/tasks/temporal_graph/enrich_events.py +34 -0
  136. cognee/tasks/temporal_graph/extract_events_and_entities.py +32 -0
  137. cognee/tasks/temporal_graph/extract_knowledge_graph_from_events.py +41 -0
  138. cognee/tasks/temporal_graph/models.py +49 -0
  139. cognee/tests/test_kuzu.py +4 -4
  140. cognee/tests/test_neo4j.py +4 -4
  141. cognee/tests/test_permissions.py +3 -3
  142. cognee/tests/test_relational_db_migration.py +7 -5
  143. cognee/tests/test_search_db.py +18 -24
  144. cognee/tests/test_temporal_graph.py +167 -0
  145. cognee/tests/unit/api/__init__.py +1 -0
  146. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +246 -0
  147. cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +18 -2
  148. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +13 -16
  149. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +11 -16
  150. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +5 -4
  151. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +4 -2
  152. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +18 -2
  153. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +225 -0
  154. cognee/tests/unit/modules/users/__init__.py +1 -0
  155. cognee/tests/unit/modules/users/test_conditional_authentication.py +277 -0
  156. cognee/tests/unit/processing/utils/utils_test.py +20 -1
  157. {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/METADATA +8 -6
  158. {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/RECORD +162 -89
  159. cognee/tests/unit/modules/search/search_methods_test.py +0 -225
  160. {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/WHEEL +0 -0
  161. {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/entry_points.txt +0 -0
  162. {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/licenses/LICENSE +0 -0
  163. {cognee-0.2.4.dist-info → cognee-0.3.0.dev0.dist-info}/licenses/NOTICE.md +0 -0
cognee/api/v1/sync/sync.py (new file, +877 lines)
@@ -0,0 +1,877 @@
+import io
+import os
+import uuid
+import asyncio
+import aiohttp
+from pydantic import BaseModel
+from typing import List, Optional
+from datetime import datetime, timezone
+from dataclasses import dataclass
+
+from cognee.api.v1.cognify import cognify
+
+from cognee.infrastructure.files.storage import get_file_storage
+from cognee.tasks.ingestion.ingest_data import ingest_data
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.users.models import User
+from cognee.modules.data.models import Dataset
+from cognee.modules.data.methods import get_dataset_data
+from cognee.modules.sync.methods import (
+    create_sync_operation,
+    update_sync_operation,
+    mark_sync_started,
+    mark_sync_completed,
+    mark_sync_failed,
+)
+
+logger = get_logger("sync")
+
+
+async def _safe_update_progress(run_id: str, stage: str, **kwargs):
+    """
+    Safely update sync progress with better error handling and context.
+
+    Args:
+        run_id: Sync operation run ID
+        progress_percentage: Progress percentage (0-100)
+        stage: Description of current stage for logging
+        **kwargs: Additional fields to update (records_downloaded, records_uploaded, etc.)
+    """
+    try:
+        await update_sync_operation(run_id, **kwargs)
+        logger.info(f"Sync {run_id}: Progress updated during {stage}")
+    except Exception as e:
+        # Log error but don't fail the sync - progress updates are nice-to-have
+        logger.warning(
+            f"Sync {run_id}: Non-critical progress update failed during {stage}: {str(e)}"
+        )
+        # Continue without raising - sync operation is more important than progress tracking
+
+
+class LocalFileInfo(BaseModel):
+    """Model for local file information with hash."""
+
+    id: str
+    name: str
+    mime_type: Optional[str]
+    extension: Optional[str]
+    raw_data_location: str
+    content_hash: str  # MD5 hash
+    file_size: int
+    node_set: Optional[str] = None
+
+
+class CheckMissingHashesRequest(BaseModel):
+    """Request model for checking missing hashes in a dataset"""
+
+    dataset_id: str
+    dataset_name: str
+    hashes: List[str]
+
+
+class CheckHashesDiffResponse(BaseModel):
+    """Response model for missing hashes check"""
+
+    missing_on_remote: List[str]
+    missing_on_local: List[str]
+
+
+class PruneDatasetRequest(BaseModel):
+    """Request model for pruning dataset to specific hashes"""
+
+    items: List[str]
+
+
+class SyncResponse(BaseModel):
+    """Response model for sync operations."""
+
+    run_id: str
+    status: str  # "started" for immediate response
+    dataset_ids: List[str]
+    dataset_names: List[str]
+    message: str
+    timestamp: str
+    user_id: str
+
+
+async def sync(
+    datasets: List[Dataset],
+    user: User,
+) -> SyncResponse:
+    """
+    Sync local Cognee data to Cognee Cloud.
+
+    This function handles synchronization of multiple datasets, knowledge graphs, and
+    processed data to the Cognee Cloud infrastructure. It uploads local data for
+    cloud-based processing, backup, and sharing.
+
+    Args:
+        datasets: List of Dataset objects to sync (permissions already verified)
+        user: User object for authentication and permissions
+
+    Returns:
+        SyncResponse model with immediate response:
+        - run_id: Unique identifier for tracking this sync operation
+        - status: Always "started" (sync runs in background)
+        - dataset_ids: List of dataset IDs being synced
+        - dataset_names: List of dataset names being synced
+        - message: Description of what's happening
+        - timestamp: When the sync was initiated
+        - user_id: User who initiated the sync
+
+    Raises:
+        ConnectionError: If Cognee Cloud service is unreachable
+        Exception: For other sync-related errors
+    """
+    if not datasets:
+        raise ValueError("At least one dataset must be provided for sync operation")
+
+    # Generate a unique run ID
+    run_id = str(uuid.uuid4())
+
+    # Get current timestamp
+    timestamp = datetime.now(timezone.utc).isoformat()
+
+    dataset_info = ", ".join([f"{d.name} ({d.id})" for d in datasets])
+    logger.info(f"Starting cloud sync operation {run_id}: datasets {dataset_info}")
+
+    # Create sync operation record in database (total_records will be set during background sync)
+    try:
+        await create_sync_operation(
+            run_id=run_id,
+            dataset_ids=[d.id for d in datasets],
+            dataset_names=[d.name for d in datasets],
+            user_id=user.id,
+        )
+        logger.info(f"Created sync operation record for {run_id}")
+    except Exception as e:
+        logger.error(f"Failed to create sync operation record: {str(e)}")
+        # Continue without database tracking if record creation fails
+
+    # Start the sync operation in the background
+    asyncio.create_task(_perform_background_sync(run_id, datasets, user))
+
+    # Return immediately with run_id
+    return SyncResponse(
+        run_id=run_id,
+        status="started",
+        dataset_ids=[str(d.id) for d in datasets],
+        dataset_names=[d.name for d in datasets],
+        message=f"Sync operation started in background for {len(datasets)} datasets. Use run_id '{run_id}' to track progress.",
+        timestamp=timestamp,
+        user_id=str(user.id),
+    )
+
+
+async def _perform_background_sync(run_id: str, datasets: List[Dataset], user: User) -> None:
+    """Perform the actual sync operation in the background for multiple datasets."""
+    start_time = datetime.now(timezone.utc)
+
+    try:
+        dataset_info = ", ".join([f"{d.name} ({d.id})" for d in datasets])
+        logger.info(f"Background sync {run_id}: Starting sync for datasets {dataset_info}")
+
+        # Mark sync as in progress
+        await mark_sync_started(run_id)
+
+        # Perform the actual sync operation
+        MAX_RETRY_COUNT = 3
+        retry_count = 0
+        while retry_count < MAX_RETRY_COUNT:
+            try:
+                (
+                    records_downloaded,
+                    records_uploaded,
+                    bytes_downloaded,
+                    bytes_uploaded,
+                    dataset_sync_hashes,
+                ) = await _sync_to_cognee_cloud(datasets, user, run_id)
+                break
+            except Exception as e:
+                retry_count += 1
+                logger.error(
+                    f"Background sync {run_id}: Failed after {retry_count} retries with error: {str(e)}"
+                )
+                await update_sync_operation(run_id, retry_count=retry_count)
+                await asyncio.sleep(2**retry_count)
+                continue
+
+        if retry_count == MAX_RETRY_COUNT:
+            logger.error(f"Background sync {run_id}: Failed after {MAX_RETRY_COUNT} retries")
+            await mark_sync_failed(run_id, "Failed after 3 retries")
+            return
+
+        end_time = datetime.now(timezone.utc)
+        duration = (end_time - start_time).total_seconds()
+
+        logger.info(
+            f"Background sync {run_id}: Completed successfully. Downloaded: {records_downloaded} records/{bytes_downloaded} bytes, Uploaded: {records_uploaded} records/{bytes_uploaded} bytes, Duration: {duration}s"
+        )
+
+        # Mark sync as completed with final stats and data lineage
+        await mark_sync_completed(
+            run_id,
+            records_downloaded,
+            records_uploaded,
+            bytes_downloaded,
+            bytes_uploaded,
+            dataset_sync_hashes,
+        )
+
+    except Exception as e:
+        end_time = datetime.now(timezone.utc)
+        duration = (end_time - start_time).total_seconds()
+
+        logger.error(f"Background sync {run_id}: Failed after {duration}s with error: {str(e)}")
+
+        # Mark sync as failed with error message
+        await mark_sync_failed(run_id, str(e))
+
+
+async def _sync_to_cognee_cloud(
+    datasets: List[Dataset], user: User, run_id: str
+) -> tuple[int, int, int, int, dict]:
+    """
+    Sync local data to Cognee Cloud using three-step idempotent process:
+    1. Extract local files with stored MD5 hashes and check what's missing on cloud
+    2. Upload missing files individually
+    3. Prune cloud dataset to match local state
+    """
+    dataset_info = ", ".join([f"{d.name} ({d.id})" for d in datasets])
+    logger.info(f"Starting sync to Cognee Cloud: datasets {dataset_info}")
+
+    total_records_downloaded = 0
+    total_records_uploaded = 0
+    total_bytes_downloaded = 0
+    total_bytes_uploaded = 0
+    dataset_sync_hashes = {}
+
+    try:
+        # Get cloud configuration
+        cloud_base_url = await _get_cloud_base_url()
+        cloud_auth_token = await _get_cloud_auth_token(user)
+
+        # Step 1: Sync files for all datasets concurrently
+        sync_files_tasks = [
+            _sync_dataset_files(dataset, cloud_base_url, cloud_auth_token, user, run_id)
+            for dataset in datasets
+        ]
+
+        logger.info(f"Starting concurrent file sync for {len(datasets)} datasets")
+
+        has_any_uploads = False
+        has_any_downloads = False
+        processed_datasets = []
+        completed_datasets = 0
+
+        # Process datasets concurrently and accumulate results
+        for completed_task in asyncio.as_completed(sync_files_tasks):
+            try:
+                dataset_result = await completed_task
+                completed_datasets += 1
+
+                # Update progress based on completed datasets (0-80% for file sync)
+                file_sync_progress = int((completed_datasets / len(datasets)) * 80)
+                await _safe_update_progress(
+                    run_id, "file_sync", progress_percentage=file_sync_progress
+                )
+
+                if dataset_result is None:
+                    logger.info(
+                        f"Progress: {completed_datasets}/{len(datasets)} datasets processed ({file_sync_progress}%)"
+                    )
+                    continue
+
+                total_records_downloaded += dataset_result.records_downloaded
+                total_records_uploaded += dataset_result.records_uploaded
+                total_bytes_downloaded += dataset_result.bytes_downloaded
+                total_bytes_uploaded += dataset_result.bytes_uploaded
+
+                # Build per-dataset hash tracking for data lineage
+                dataset_sync_hashes[dataset_result.dataset_id] = {
+                    "uploaded": dataset_result.uploaded_hashes,
+                    "downloaded": dataset_result.downloaded_hashes,
+                }
+
+                if dataset_result.has_uploads:
+                    has_any_uploads = True
+                if dataset_result.has_downloads:
+                    has_any_downloads = True
+
+                processed_datasets.append(dataset_result.dataset_id)
+
+                logger.info(
+                    f"Progress: {completed_datasets}/{len(datasets)} datasets processed ({file_sync_progress}%) - "
+                    f"Completed file sync for dataset {dataset_result.dataset_name}: "
+                    f"↑{dataset_result.records_uploaded} files ({dataset_result.bytes_uploaded} bytes), "
+                    f"↓{dataset_result.records_downloaded} files ({dataset_result.bytes_downloaded} bytes)"
+                )
+            except Exception as e:
+                completed_datasets += 1
+                logger.error(f"Dataset file sync failed: {str(e)}")
+                # Update progress even for failed datasets
+                file_sync_progress = int((completed_datasets / len(datasets)) * 80)
+                await _safe_update_progress(
+                    run_id, "file_sync", progress_percentage=file_sync_progress
+                )
+                # Continue with other datasets even if one fails
+
+        # Step 2: Trigger cognify processing once for all datasets (only if any files were uploaded)
+        # Update progress to 90% before cognify
+        await _safe_update_progress(run_id, "cognify", progress_percentage=90)
+
+        if has_any_uploads and processed_datasets:
+            logger.info(
+                f"Progress: 90% - Triggering cognify processing for {len(processed_datasets)} datasets with new files"
+            )
+            try:
+                # Trigger cognify for all datasets at once - use first dataset as reference point
+                await _trigger_remote_cognify(
+                    cloud_base_url, cloud_auth_token, datasets[0].id, run_id
+                )
+                logger.info("Cognify processing triggered successfully for all datasets")
+            except Exception as e:
+                logger.warning(f"Failed to trigger cognify processing: {str(e)}")
+                # Don't fail the entire sync if cognify fails
+        else:
+            logger.info(
+                "Progress: 90% - Skipping cognify processing - no new files were uploaded across any datasets"
+            )
+
+        # Step 3: Trigger local cognify processing if any files were downloaded
+        if has_any_downloads and processed_datasets:
+            logger.info(
+                f"Progress: 95% - Triggering local cognify processing for {len(processed_datasets)} datasets with downloaded files"
+            )
+            try:
+                await cognify()
+                logger.info("Local cognify processing completed successfully for all datasets")
+            except Exception as e:
+                logger.warning(f"Failed to run local cognify processing: {str(e)}")
+                # Don't fail the entire sync if local cognify fails
+        else:
+            logger.info(
+                "Progress: 95% - Skipping local cognify processing - no new files were downloaded across any datasets"
+            )
+
+        # Update final progress
+        try:
+            await _safe_update_progress(
+                run_id,
+                "final",
+                progress_percentage=100,
+                total_records_to_sync=total_records_uploaded + total_records_downloaded,
+                total_records_to_download=total_records_downloaded,
+                total_records_to_upload=total_records_uploaded,
+                records_downloaded=total_records_downloaded,
+                records_uploaded=total_records_uploaded,
+            )
+        except Exception as e:
+            logger.warning(f"Failed to update final sync progress: {str(e)}")
+
+        logger.info(
+            f"Multi-dataset sync completed: {len(datasets)} datasets processed, downloaded {total_records_downloaded} records/{total_bytes_downloaded} bytes, uploaded {total_records_uploaded} records/{total_bytes_uploaded} bytes"
+        )
+
+        return (
+            total_records_downloaded,
+            total_records_uploaded,
+            total_bytes_downloaded,
+            total_bytes_uploaded,
+            dataset_sync_hashes,
+        )
+
+    except Exception as e:
+        logger.error(f"Sync failed: {str(e)}")
+        raise ConnectionError(f"Cloud sync failed: {str(e)}")
+
+
+@dataclass
+class DatasetSyncResult:
+    """Result of syncing files for a single dataset."""
+
+    dataset_name: str
+    dataset_id: str
+    records_downloaded: int
+    records_uploaded: int
+    bytes_downloaded: int
+    bytes_uploaded: int
+    has_uploads: bool  # Whether any files were uploaded (for cognify decision)
+    has_downloads: bool  # Whether any files were downloaded (for cognify decision)
+    uploaded_hashes: List[str]  # Content hashes of files uploaded during sync
+    downloaded_hashes: List[str]  # Content hashes of files downloaded during sync
+
+
+async def _sync_dataset_files(
+    dataset: Dataset, cloud_base_url: str, cloud_auth_token: str, user: User, run_id: str
+) -> Optional[DatasetSyncResult]:
+    """
+    Sync files for a single dataset (2-way: upload to cloud, download from cloud).
+    Does NOT trigger cognify - that's done separately once for all datasets.
+
+    Returns:
+        DatasetSyncResult with sync results or None if dataset was empty
+    """
+    logger.info(f"Syncing files for dataset: {dataset.name} ({dataset.id})")
+
+    try:
+        # Step 1: Extract local file info with stored hashes
+        local_files = await _extract_local_files_with_hashes(dataset, user, run_id)
+        logger.info(f"Found {len(local_files)} local files for dataset {dataset.name}")
+
+        if not local_files:
+            logger.info(f"No files to sync for dataset {dataset.name} - skipping")
+            return None
+
+        # Step 2: Check what files are missing on cloud
+        local_hashes = [f.content_hash for f in local_files]
+        hashes_diff_response = await _check_hashes_diff(
+            cloud_base_url, cloud_auth_token, dataset, local_hashes, run_id
+        )
+
+        hashes_missing_on_remote = hashes_diff_response.missing_on_remote
+        hashes_missing_on_local = hashes_diff_response.missing_on_local
+
+        logger.info(
+            f"Dataset {dataset.name}: {len(hashes_missing_on_remote)} files to upload, {len(hashes_missing_on_local)} files to download"
+        )
+
+        # Step 3: Upload files that are missing on cloud
+        bytes_uploaded = await _upload_missing_files(
+            cloud_base_url, cloud_auth_token, dataset, local_files, hashes_missing_on_remote, run_id
+        )
+        logger.info(
+            f"Dataset {dataset.name}: Upload complete - {len(hashes_missing_on_remote)} files, {bytes_uploaded} bytes"
+        )
+
+        # Step 4: Download files that are missing locally
+        bytes_downloaded = await _download_missing_files(
+            cloud_base_url, cloud_auth_token, dataset, hashes_missing_on_local, user
+        )
+        logger.info(
+            f"Dataset {dataset.name}: Download complete - {len(hashes_missing_on_local)} files, {bytes_downloaded} bytes"
+        )
+
+        return DatasetSyncResult(
+            dataset_name=dataset.name,
+            dataset_id=str(dataset.id),
+            records_downloaded=len(hashes_missing_on_local),
+            records_uploaded=len(hashes_missing_on_remote),
+            bytes_downloaded=bytes_downloaded,
+            bytes_uploaded=bytes_uploaded,
+            has_uploads=len(hashes_missing_on_remote) > 0,
+            has_downloads=len(hashes_missing_on_local) > 0,
+            uploaded_hashes=hashes_missing_on_remote,
+            downloaded_hashes=hashes_missing_on_local,
+        )
+
+    except Exception as e:
+        logger.error(f"Failed to sync files for dataset {dataset.name} ({dataset.id}): {str(e)}")
+        raise  # Re-raise to be handled by the caller
+
+
+async def _extract_local_files_with_hashes(
+    dataset: Dataset, user: User, run_id: str
+) -> List[LocalFileInfo]:
+    """
+    Extract local dataset data with existing MD5 hashes from database.
+
+    Args:
+        dataset: Dataset to extract files from
+        user: User performing the sync
+        run_id: Unique identifier for this sync operation
+
+    Returns:
+        List[LocalFileInfo]: Information about each local file with stored hash
+    """
+    try:
+        logger.info(f"Extracting files from dataset: {dataset.name} ({dataset.id})")
+
+        # Get all data entries linked to this dataset
+        data_entries = await get_dataset_data(dataset.id)
+        logger.info(f"Found {len(data_entries)} data entries in dataset")
+
+        # Process each data entry to get file info and hash
+        local_files: List[LocalFileInfo] = []
+        skipped_count = 0
+
+        for data_entry in data_entries:
+            try:
+                # Use existing content_hash from database
+                content_hash = data_entry.raw_content_hash
+                file_size = data_entry.data_size if data_entry.data_size else 0
+
+                # Skip entries without content hash (shouldn't happen in normal cases)
+                if not content_hash:
+                    skipped_count += 1
+                    logger.warning(
+                        f"Skipping file {data_entry.name}: missing content_hash in database"
+                    )
+                    continue
+
+                if file_size == 0:
+                    # Get file size from filesystem if not stored
+                    file_size = await _get_file_size(data_entry.raw_data_location)
+
+                local_files.append(
+                    LocalFileInfo(
+                        id=str(data_entry.id),
+                        name=data_entry.name,
+                        mime_type=data_entry.mime_type,
+                        extension=data_entry.extension,
+                        raw_data_location=data_entry.raw_data_location,
+                        content_hash=content_hash,
+                        file_size=file_size,
+                        node_set=data_entry.node_set,
+                    )
+                )
+
+            except Exception as e:
+                skipped_count += 1
+                logger.warning(f"Failed to process file {data_entry.name}: {str(e)}")
+                # Continue with other entries even if one fails
+                continue
+
+        logger.info(
+            f"File extraction complete: {len(local_files)} files processed, {skipped_count} skipped"
+        )
+        return local_files
+
+    except Exception as e:
+        logger.error(f"Failed to extract files from dataset {dataset.name}: {str(e)}")
+        raise
+
+
+async def _get_file_size(file_path: str) -> int:
+    """Get file size in bytes."""
+    try:
+        file_dir = os.path.dirname(file_path)
+        file_name = os.path.basename(file_path)
+        file_storage = get_file_storage(file_dir)
+
+        return await file_storage.get_size(file_name)
+    except Exception:
+        return 0
+
+
+async def _get_cloud_base_url() -> str:
+    """Get Cognee Cloud API base URL."""
+    return os.getenv("COGNEE_CLOUD_API_URL", "http://localhost:8001")
+
+
+async def _get_cloud_auth_token(user: User) -> str:
+    """Get authentication token for Cognee Cloud API."""
+    return os.getenv("COGNEE_CLOUD_AUTH_TOKEN", "your-auth-token")
+
+
+async def _check_hashes_diff(
+    cloud_base_url: str, auth_token: str, dataset: Dataset, local_hashes: List[str], run_id: str
+) -> CheckHashesDiffResponse:
+    """
+    Check which hashes are missing on cloud.
+
+    Returns:
+        List[str]: MD5 hashes that need to be uploaded
+    """
+    url = f"{cloud_base_url}/api/sync/{dataset.id}/diff"
+    headers = {"X-Api-Key": auth_token, "Content-Type": "application/json"}
+
+    payload = CheckMissingHashesRequest(
+        dataset_id=str(dataset.id), dataset_name=dataset.name, hashes=local_hashes
+    )
+
+    logger.info(f"Checking missing hashes on cloud for dataset {dataset.id}")
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url, json=payload.dict(), headers=headers) as response:
+                if response.status == 200:
+                    data = await response.json()
+                    missing_response = CheckHashesDiffResponse(**data)
+                    logger.info(
+                        f"Cloud is missing {len(missing_response.missing_on_remote)} out of {len(local_hashes)} files, local is missing {len(missing_response.missing_on_local)} files"
+                    )
+                    return missing_response
+                else:
+                    error_text = await response.text()
+                    logger.error(
+                        f"Failed to check missing hashes: Status {response.status} - {error_text}"
+                    )
+                    raise ConnectionError(
+                        f"Failed to check missing hashes: {response.status} - {error_text}"
+                    )
+
+    except Exception as e:
+        logger.error(f"Error checking missing hashes: {str(e)}")
+        raise ConnectionError(f"Failed to check missing hashes: {str(e)}")
+
+
+async def _download_missing_files(
+    cloud_base_url: str,
+    auth_token: str,
+    dataset: Dataset,
+    hashes_missing_on_local: List[str],
+    user: User,
+) -> int:
+    """
+    Download files that are missing locally from the cloud.
+
+    Returns:
+        int: Total bytes downloaded
+    """
+    logger.info(f"Downloading {len(hashes_missing_on_local)} missing files from cloud")
+
+    if not hashes_missing_on_local:
+        logger.info("No files need to be downloaded - all files already exist locally")
+        return 0
+
+    total_bytes_downloaded = 0
+    downloaded_count = 0
+
+    headers = {"X-Api-Key": auth_token}
+
+    async with aiohttp.ClientSession() as session:
+        for file_hash in hashes_missing_on_local:
+            try:
+                # Download file from cloud by hash
+                download_url = f"{cloud_base_url}/api/sync/{dataset.id}/data/{file_hash}"
+
+                logger.debug(f"Downloading file with hash: {file_hash}")
+
+                async with session.get(download_url, headers=headers) as response:
+                    if response.status == 200:
+                        file_content = await response.read()
+                        file_size = len(file_content)
+
+                        # Get file metadata from response headers
+                        file_name = response.headers.get("X-File-Name", f"file_{file_hash}")
+
+                        # Save file locally using ingestion pipeline
+                        await _save_downloaded_file(
+                            dataset, file_hash, file_name, file_content, user
+                        )
+
+                        total_bytes_downloaded += file_size
+                        downloaded_count += 1
+
+                        logger.debug(f"Successfully downloaded {file_name} ({file_size} bytes)")
+
+                    elif response.status == 404:
+                        logger.warning(f"File with hash {file_hash} not found on cloud")
+                        continue
+                    else:
+                        error_text = await response.text()
+                        logger.error(
+                            f"Failed to download file {file_hash}: Status {response.status} - {error_text}"
+                        )
+                        continue
+
+            except Exception as e:
+                logger.error(f"Error downloading file {file_hash}: {str(e)}")
+                continue
+
+    logger.info(
+        f"Download summary: {downloaded_count}/{len(hashes_missing_on_local)} files downloaded, {total_bytes_downloaded} bytes total"
+    )
+    return total_bytes_downloaded
+
+
+class InMemoryDownload:
+    def __init__(self, data: bytes, filename: str):
+        self.file = io.BufferedReader(io.BytesIO(data))
+        self.filename = filename
+
+
+async def _save_downloaded_file(
+    dataset: Dataset,
+    file_hash: str,
+    file_name: str,
+    file_content: bytes,
+    user: User,
+) -> None:
+    """
+    Save a downloaded file to local storage and register it in the dataset.
+    Uses the existing ingest_data function for consistency with normal ingestion.
+
+    Args:
+        dataset: The dataset to add the file to
+        file_hash: MD5 hash of the file content
+        file_name: Original file name
+        file_content: Raw file content bytes
+    """
+    try:
+        # Create a temporary file-like object from the bytes
+        file_obj = InMemoryDownload(file_content, file_name)
+
+        # User is injected as dependency
+
+        # Use the existing ingest_data function to properly handle the file
+        # This ensures consistency with normal file ingestion
+        await ingest_data(
+            data=file_obj,
+            dataset_name=dataset.name,
+            user=user,
+            dataset_id=dataset.id,
+        )
+
+        logger.debug(f"Successfully saved downloaded file: {file_name} (hash: {file_hash})")
+
+    except Exception as e:
+        logger.error(f"Failed to save downloaded file {file_name}: {str(e)}")
+        raise
+
+
+async def _upload_missing_files(
+    cloud_base_url: str,
+    auth_token: str,
+    dataset: Dataset,
+    local_files: List[LocalFileInfo],
+    hashes_missing_on_remote: List[str],
+    run_id: str,
+) -> int:
+    """
+    Upload files that are missing on cloud.
+
+    Returns:
+        int: Total bytes uploaded
+    """
+    # Filter local files to only those with missing hashes
+    files_to_upload = [f for f in local_files if f.content_hash in hashes_missing_on_remote]
+
+    logger.info(f"Uploading {len(files_to_upload)} missing files to cloud")
+
+    if not files_to_upload:
+        logger.info("No files need to be uploaded - all files already exist on cloud")
+        return 0
+
+    total_bytes_uploaded = 0
+    uploaded_count = 0
+
+    headers = {"X-Api-Key": auth_token}
+
+    async with aiohttp.ClientSession() as session:
+        for file_info in files_to_upload:
+            try:
+                file_dir = os.path.dirname(file_info.raw_data_location)
+                file_name = os.path.basename(file_info.raw_data_location)
+                file_storage = get_file_storage(file_dir)
+
+                async with file_storage.open(file_name, mode="rb") as file:
+                    file_content = file.read()
+
+                    # Upload file
+                    url = f"{cloud_base_url}/api/sync/{dataset.id}/data/{file_info.id}"
+
+                    request_data = aiohttp.FormData()
+
+                    request_data.add_field(
+                        "file", file_content, content_type=file_info.mime_type, filename=file_info.name
+                    )
+                    request_data.add_field("dataset_id", str(dataset.id))
+                    request_data.add_field("dataset_name", dataset.name)
+                    request_data.add_field("data_id", str(file_info.id))
+                    request_data.add_field("mime_type", file_info.mime_type)
+                    request_data.add_field("extension", file_info.extension)
+                    request_data.add_field("md5", file_info.content_hash)
+
+                    async with session.put(url, data=request_data, headers=headers) as response:
+                        if response.status in [200, 201]:
+                            total_bytes_uploaded += len(file_content)
+                            uploaded_count += 1
+                        else:
+                            error_text = await response.text()
+                            logger.error(
+                                f"Failed to upload {file_info.name}: Status {response.status} - {error_text}"
+                            )
+                            raise ConnectionError(
+                                f"Upload failed for {file_info.name}: HTTP {response.status} - {error_text}"
+                            )
+
+            except Exception as e:
+                logger.error(f"Error uploading file {file_info.name}: {str(e)}")
+                raise ConnectionError(f"Upload failed for {file_info.name}: {str(e)}")
+
+    logger.info(f"All {uploaded_count} files uploaded successfully: {total_bytes_uploaded} bytes")
+    return total_bytes_uploaded
+
+
+async def _prune_cloud_dataset(
+    cloud_base_url: str, auth_token: str, dataset_id: str, local_hashes: List[str], run_id: str
+) -> None:
+    """
+    Prune cloud dataset to match local state.
+    """
+    url = f"{cloud_base_url}/api/sync/{dataset_id}?prune=true"
+    headers = {"X-Api-Key": auth_token, "Content-Type": "application/json"}
+
+    payload = PruneDatasetRequest(items=local_hashes)
+
+    logger.info("Pruning cloud dataset to match local state")
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.put(url, json=payload.dict(), headers=headers) as response:
+                if response.status == 200:
+                    data = await response.json()
+                    deleted_entries = data.get("deleted_database_entries", 0)
+                    deleted_files = data.get("deleted_files_from_storage", 0)
+
+                    logger.info(
+                        f"Cloud dataset pruned successfully: {deleted_entries} entries deleted, {deleted_files} files removed"
+                    )
+                else:
+                    error_text = await response.text()
+                    logger.error(
+                        f"Failed to prune cloud dataset: Status {response.status} - {error_text}"
+                    )
+                    # Don't raise error for prune failures - sync partially succeeded
+
+    except Exception as e:
+        logger.error(f"Error pruning cloud dataset: {str(e)}")
+        # Don't raise error for prune failures - sync partially succeeded
+
+
+async def _trigger_remote_cognify(
+    cloud_base_url: str, auth_token: str, dataset_id: str, run_id: str
+) -> None:
+    """
+    Trigger cognify processing on the cloud dataset.
+
+    This initiates knowledge graph processing on the synchronized dataset
+    using the cloud infrastructure.
+    """
+    url = f"{cloud_base_url}/api/cognify"
+    headers = {"X-Api-Key": auth_token, "Content-Type": "application/json"}
+
+    payload = {
+        "dataset_ids": [str(dataset_id)],  # Convert UUID to string for JSON serialization
+        "run_in_background": False,
+        "custom_prompt": "",
+    }
+
+    logger.info(f"Triggering cognify processing for dataset {dataset_id}")
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url, json=payload, headers=headers) as response:
+                if response.status == 200:
+                    data = await response.json()
+                    logger.info(f"Cognify processing started successfully: {data}")
+
+                    # Extract pipeline run IDs for monitoring if available
+                    if isinstance(data, dict):
+                        for dataset_key, run_info in data.items():
+                            if isinstance(run_info, dict) and "pipeline_run_id" in run_info:
+                                logger.info(
+                                    f"Cognify pipeline run ID for dataset {dataset_key}: {run_info['pipeline_run_id']}"
+                                )
+                else:
+                    error_text = await response.text()
+                    logger.warning(
+                        f"Failed to trigger cognify processing: Status {response.status} - {error_text}"
+                    )
+                    # TODO: consider adding retries
+
+    except Exception as e:
+        logger.warning(f"Error triggering cognify processing: {str(e)}")
+        # TODO: consider adding retries
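For orientation, the new sync() entry point above returns a SyncResponse as soon as the background task is scheduled and continues the upload/download work asynchronously. Below is a minimal, hypothetical sketch of how a caller might drive it; sync(), SyncResponse, Dataset, and get_default_user are taken from this package, while the dataset lookup is a placeholder that is not part of this diff.

# Hypothetical usage sketch for the sync() API added in 0.3.0.dev0.
# The dataset lookup is a stub you would replace with your own code.
import asyncio
from typing import List

from cognee.api.v1.sync.sync import sync
from cognee.modules.users.methods import get_default_user
from cognee.modules.data.models import Dataset


async def load_datasets(user) -> List[Dataset]:
    # Placeholder: resolve the Dataset objects the user is allowed to sync,
    # e.g. via the helpers in cognee.modules.data.methods.
    raise NotImplementedError


async def main():
    user = await get_default_user()
    datasets = await load_datasets(user)

    # Returns immediately; the actual work runs in a background asyncio task
    # created inside sync(). Use run_id with the new sync routes to track it.
    response = await sync(datasets=datasets, user=user)
    print(response.run_id, response.status, response.dataset_names)


asyncio.run(main())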