cognee 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +2 -0
- cognee/api/client.py +28 -3
- cognee/api/health.py +10 -13
- cognee/api/v1/add/add.py +3 -1
- cognee/api/v1/add/routers/get_add_router.py +12 -37
- cognee/api/v1/cloud/routers/__init__.py +1 -0
- cognee/api/v1/cloud/routers/get_checks_router.py +23 -0
- cognee/api/v1/cognify/code_graph_pipeline.py +9 -4
- cognee/api/v1/cognify/cognify.py +50 -3
- cognee/api/v1/cognify/routers/get_cognify_router.py +1 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +15 -4
- cognee/api/v1/memify/__init__.py +0 -0
- cognee/api/v1/memify/routers/__init__.py +1 -0
- cognee/api/v1/memify/routers/get_memify_router.py +100 -0
- cognee/api/v1/notebooks/routers/__init__.py +1 -0
- cognee/api/v1/notebooks/routers/get_notebooks_router.py +96 -0
- cognee/api/v1/search/routers/get_search_router.py +20 -1
- cognee/api/v1/search/search.py +11 -4
- cognee/api/v1/sync/__init__.py +17 -0
- cognee/api/v1/sync/routers/__init__.py +3 -0
- cognee/api/v1/sync/routers/get_sync_router.py +241 -0
- cognee/api/v1/sync/sync.py +877 -0
- cognee/api/v1/ui/__init__.py +1 -0
- cognee/api/v1/ui/ui.py +529 -0
- cognee/api/v1/users/routers/get_auth_router.py +13 -1
- cognee/base_config.py +10 -1
- cognee/cli/_cognee.py +93 -0
- cognee/infrastructure/databases/graph/config.py +10 -4
- cognee/infrastructure/databases/graph/kuzu/adapter.py +135 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +89 -0
- cognee/infrastructure/databases/relational/__init__.py +2 -0
- cognee/infrastructure/databases/relational/get_async_session.py +15 -0
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +6 -1
- cognee/infrastructure/databases/relational/with_async_session.py +25 -0
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +1 -1
- cognee/infrastructure/databases/vector/config.py +13 -6
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +2 -6
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +4 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -0
- cognee/infrastructure/files/storage/S3FileStorage.py +5 -0
- cognee/infrastructure/files/storage/StorageManager.py +7 -1
- cognee/infrastructure/files/storage/storage.py +16 -0
- cognee/infrastructure/llm/LLMGateway.py +18 -0
- cognee/infrastructure/llm/config.py +4 -2
- cognee/infrastructure/llm/prompts/extract_query_time.txt +15 -0
- cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +25 -0
- cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +30 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +2 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py +44 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py +1 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py +46 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +25 -1
- cognee/infrastructure/utils/run_sync.py +8 -1
- cognee/modules/chunking/models/DocumentChunk.py +4 -3
- cognee/modules/cloud/exceptions/CloudApiKeyMissingError.py +15 -0
- cognee/modules/cloud/exceptions/CloudConnectionError.py +15 -0
- cognee/modules/cloud/exceptions/__init__.py +2 -0
- cognee/modules/cloud/operations/__init__.py +1 -0
- cognee/modules/cloud/operations/check_api_key.py +25 -0
- cognee/modules/data/deletion/prune_system.py +1 -1
- cognee/modules/data/methods/check_dataset_name.py +1 -1
- cognee/modules/data/methods/get_dataset_data.py +1 -1
- cognee/modules/data/methods/load_or_create_datasets.py +1 -1
- cognee/modules/engine/models/Event.py +16 -0
- cognee/modules/engine/models/Interval.py +8 -0
- cognee/modules/engine/models/Timestamp.py +13 -0
- cognee/modules/engine/models/__init__.py +3 -0
- cognee/modules/engine/utils/__init__.py +2 -0
- cognee/modules/engine/utils/generate_event_datapoint.py +46 -0
- cognee/modules/engine/utils/generate_timestamp_datapoint.py +51 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +2 -2
- cognee/modules/graph/utils/__init__.py +1 -0
- cognee/modules/graph/utils/resolve_edges_to_text.py +71 -0
- cognee/modules/memify/__init__.py +1 -0
- cognee/modules/memify/memify.py +118 -0
- cognee/modules/notebooks/methods/__init__.py +5 -0
- cognee/modules/notebooks/methods/create_notebook.py +26 -0
- cognee/modules/notebooks/methods/delete_notebook.py +13 -0
- cognee/modules/notebooks/methods/get_notebook.py +21 -0
- cognee/modules/notebooks/methods/get_notebooks.py +18 -0
- cognee/modules/notebooks/methods/update_notebook.py +17 -0
- cognee/modules/notebooks/models/Notebook.py +53 -0
- cognee/modules/notebooks/models/__init__.py +1 -0
- cognee/modules/notebooks/operations/__init__.py +1 -0
- cognee/modules/notebooks/operations/run_in_local_sandbox.py +55 -0
- cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +19 -3
- cognee/modules/pipelines/operations/pipeline.py +1 -0
- cognee/modules/pipelines/operations/run_tasks.py +17 -41
- cognee/modules/retrieval/base_graph_retriever.py +18 -0
- cognee/modules/retrieval/base_retriever.py +1 -1
- cognee/modules/retrieval/code_retriever.py +8 -0
- cognee/modules/retrieval/coding_rules_retriever.py +31 -0
- cognee/modules/retrieval/completion_retriever.py +9 -3
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -0
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +23 -14
- cognee/modules/retrieval/graph_completion_cot_retriever.py +21 -11
- cognee/modules/retrieval/graph_completion_retriever.py +32 -65
- cognee/modules/retrieval/graph_summary_completion_retriever.py +3 -1
- cognee/modules/retrieval/insights_retriever.py +14 -3
- cognee/modules/retrieval/summaries_retriever.py +1 -1
- cognee/modules/retrieval/temporal_retriever.py +152 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +7 -32
- cognee/modules/retrieval/utils/completion.py +10 -3
- cognee/modules/search/methods/get_search_type_tools.py +168 -0
- cognee/modules/search/methods/no_access_control_search.py +47 -0
- cognee/modules/search/methods/search.py +219 -139
- cognee/modules/search/types/SearchResult.py +21 -0
- cognee/modules/search/types/SearchType.py +2 -0
- cognee/modules/search/types/__init__.py +1 -0
- cognee/modules/search/utils/__init__.py +2 -0
- cognee/modules/search/utils/prepare_search_result.py +41 -0
- cognee/modules/search/utils/transform_context_to_graph.py +38 -0
- cognee/modules/sync/__init__.py +1 -0
- cognee/modules/sync/methods/__init__.py +23 -0
- cognee/modules/sync/methods/create_sync_operation.py +53 -0
- cognee/modules/sync/methods/get_sync_operation.py +107 -0
- cognee/modules/sync/methods/update_sync_operation.py +248 -0
- cognee/modules/sync/models/SyncOperation.py +142 -0
- cognee/modules/sync/models/__init__.py +3 -0
- cognee/modules/users/__init__.py +0 -1
- cognee/modules/users/methods/__init__.py +4 -1
- cognee/modules/users/methods/create_user.py +26 -1
- cognee/modules/users/methods/get_authenticated_user.py +36 -42
- cognee/modules/users/methods/get_default_user.py +3 -1
- cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +2 -1
- cognee/root_dir.py +19 -0
- cognee/shared/logging_utils.py +1 -1
- cognee/tasks/codingagents/__init__.py +0 -0
- cognee/tasks/codingagents/coding_rule_associations.py +127 -0
- cognee/tasks/ingestion/save_data_item_to_storage.py +23 -0
- cognee/tasks/memify/__init__.py +2 -0
- cognee/tasks/memify/extract_subgraph.py +7 -0
- cognee/tasks/memify/extract_subgraph_chunks.py +11 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +52 -27
- cognee/tasks/temporal_graph/__init__.py +1 -0
- cognee/tasks/temporal_graph/add_entities_to_event.py +85 -0
- cognee/tasks/temporal_graph/enrich_events.py +34 -0
- cognee/tasks/temporal_graph/extract_events_and_entities.py +32 -0
- cognee/tasks/temporal_graph/extract_knowledge_graph_from_events.py +41 -0
- cognee/tasks/temporal_graph/models.py +49 -0
- cognee/tests/test_kuzu.py +4 -4
- cognee/tests/test_neo4j.py +4 -4
- cognee/tests/test_permissions.py +3 -3
- cognee/tests/test_relational_db_migration.py +7 -5
- cognee/tests/test_search_db.py +18 -24
- cognee/tests/test_temporal_graph.py +167 -0
- cognee/tests/unit/api/__init__.py +1 -0
- cognee/tests/unit/api/test_conditional_authentication_endpoints.py +246 -0
- cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +18 -2
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +13 -16
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +11 -16
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +5 -4
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +4 -2
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +18 -2
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +225 -0
- cognee/tests/unit/modules/users/__init__.py +1 -0
- cognee/tests/unit/modules/users/test_conditional_authentication.py +277 -0
- cognee/tests/unit/processing/utils/utils_test.py +20 -1
- {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/METADATA +8 -6
- {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/RECORD +165 -90
- cognee/tests/unit/modules/search/search_methods_test.py +0 -225
- {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/WHEEL +0 -0
- {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/entry_points.txt +0 -0
- {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/licenses/NOTICE.md +0 -0

cognee/api/v1/sync/sync.py
@@ -0,0 +1,877 @@
import io
import os
import uuid
import asyncio
import aiohttp
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime, timezone
from dataclasses import dataclass

from cognee.api.v1.cognify import cognify

from cognee.infrastructure.files.storage import get_file_storage
from cognee.tasks.ingestion.ingest_data import ingest_data
from cognee.shared.logging_utils import get_logger
from cognee.modules.users.models import User
from cognee.modules.data.models import Dataset
from cognee.modules.data.methods import get_dataset_data
from cognee.modules.sync.methods import (
    create_sync_operation,
    update_sync_operation,
    mark_sync_started,
    mark_sync_completed,
    mark_sync_failed,
)

logger = get_logger("sync")


async def _safe_update_progress(run_id: str, stage: str, **kwargs):
    """
    Safely update sync progress with better error handling and context.

    Args:
        run_id: Sync operation run ID
        progress_percentage: Progress percentage (0-100)
        stage: Description of current stage for logging
        **kwargs: Additional fields to update (records_downloaded, records_uploaded, etc.)
    """
    try:
        await update_sync_operation(run_id, **kwargs)
        logger.info(f"Sync {run_id}: Progress updated during {stage}")
    except Exception as e:
        # Log error but don't fail the sync - progress updates are nice-to-have
        logger.warning(
            f"Sync {run_id}: Non-critical progress update failed during {stage}: {str(e)}"
        )
        # Continue without raising - sync operation is more important than progress tracking


class LocalFileInfo(BaseModel):
    """Model for local file information with hash."""

    id: str
    name: str
    mime_type: Optional[str]
    extension: Optional[str]
    raw_data_location: str
    content_hash: str  # MD5 hash
    file_size: int
    node_set: Optional[str] = None


class CheckMissingHashesRequest(BaseModel):
    """Request model for checking missing hashes in a dataset"""

    dataset_id: str
    dataset_name: str
    hashes: List[str]


class CheckHashesDiffResponse(BaseModel):
    """Response model for missing hashes check"""

    missing_on_remote: List[str]
    missing_on_local: List[str]


class PruneDatasetRequest(BaseModel):
    """Request model for pruning dataset to specific hashes"""

    items: List[str]


class SyncResponse(BaseModel):
    """Response model for sync operations."""

    run_id: str
    status: str  # "started" for immediate response
    dataset_ids: List[str]
    dataset_names: List[str]
    message: str
    timestamp: str
    user_id: str


async def sync(
    datasets: List[Dataset],
    user: User,
) -> SyncResponse:
    """
    Sync local Cognee data to Cognee Cloud.

    This function handles synchronization of multiple datasets, knowledge graphs, and
    processed data to the Cognee Cloud infrastructure. It uploads local data for
    cloud-based processing, backup, and sharing.

    Args:
        datasets: List of Dataset objects to sync (permissions already verified)
        user: User object for authentication and permissions

    Returns:
        SyncResponse model with immediate response:
        - run_id: Unique identifier for tracking this sync operation
        - status: Always "started" (sync runs in background)
        - dataset_ids: List of dataset IDs being synced
        - dataset_names: List of dataset names being synced
        - message: Description of what's happening
        - timestamp: When the sync was initiated
        - user_id: User who initiated the sync

    Raises:
        ConnectionError: If Cognee Cloud service is unreachable
        Exception: For other sync-related errors
    """
    if not datasets:
        raise ValueError("At least one dataset must be provided for sync operation")

    # Generate a unique run ID
    run_id = str(uuid.uuid4())

    # Get current timestamp
    timestamp = datetime.now(timezone.utc).isoformat()

    dataset_info = ", ".join([f"{d.name} ({d.id})" for d in datasets])
    logger.info(f"Starting cloud sync operation {run_id}: datasets {dataset_info}")

    # Create sync operation record in database (total_records will be set during background sync)
    try:
        await create_sync_operation(
            run_id=run_id,
            dataset_ids=[d.id for d in datasets],
            dataset_names=[d.name for d in datasets],
            user_id=user.id,
        )
        logger.info(f"Created sync operation record for {run_id}")
    except Exception as e:
        logger.error(f"Failed to create sync operation record: {str(e)}")
        # Continue without database tracking if record creation fails

    # Start the sync operation in the background
    asyncio.create_task(_perform_background_sync(run_id, datasets, user))

    # Return immediately with run_id
    return SyncResponse(
        run_id=run_id,
        status="started",
        dataset_ids=[str(d.id) for d in datasets],
        dataset_names=[d.name for d in datasets],
        message=f"Sync operation started in background for {len(datasets)} datasets. Use run_id '{run_id}' to track progress.",
        timestamp=timestamp,
        user_id=str(user.id),
    )


async def _perform_background_sync(run_id: str, datasets: List[Dataset], user: User) -> None:
    """Perform the actual sync operation in the background for multiple datasets."""
    start_time = datetime.now(timezone.utc)

    try:
        dataset_info = ", ".join([f"{d.name} ({d.id})" for d in datasets])
        logger.info(f"Background sync {run_id}: Starting sync for datasets {dataset_info}")

        # Mark sync as in progress
        await mark_sync_started(run_id)

        # Perform the actual sync operation
        MAX_RETRY_COUNT = 3
        retry_count = 0
        while retry_count < MAX_RETRY_COUNT:
            try:
                (
                    records_downloaded,
                    records_uploaded,
                    bytes_downloaded,
                    bytes_uploaded,
                    dataset_sync_hashes,
                ) = await _sync_to_cognee_cloud(datasets, user, run_id)
                break
            except Exception as e:
                retry_count += 1
                logger.error(
                    f"Background sync {run_id}: Failed after {retry_count} retries with error: {str(e)}"
                )
                await update_sync_operation(run_id, retry_count=retry_count)
                await asyncio.sleep(2**retry_count)
                continue

        if retry_count == MAX_RETRY_COUNT:
            logger.error(f"Background sync {run_id}: Failed after {MAX_RETRY_COUNT} retries")
            await mark_sync_failed(run_id, "Failed after 3 retries")
            return

        end_time = datetime.now(timezone.utc)
        duration = (end_time - start_time).total_seconds()

        logger.info(
            f"Background sync {run_id}: Completed successfully. Downloaded: {records_downloaded} records/{bytes_downloaded} bytes, Uploaded: {records_uploaded} records/{bytes_uploaded} bytes, Duration: {duration}s"
        )

        # Mark sync as completed with final stats and data lineage
        await mark_sync_completed(
            run_id,
            records_downloaded,
            records_uploaded,
            bytes_downloaded,
            bytes_uploaded,
            dataset_sync_hashes,
        )

    except Exception as e:
        end_time = datetime.now(timezone.utc)
        duration = (end_time - start_time).total_seconds()

        logger.error(f"Background sync {run_id}: Failed after {duration}s with error: {str(e)}")

        # Mark sync as failed with error message
        await mark_sync_failed(run_id, str(e))


async def _sync_to_cognee_cloud(
    datasets: List[Dataset], user: User, run_id: str
) -> tuple[int, int, int, int, dict]:
    """
    Sync local data to Cognee Cloud using three-step idempotent process:
    1. Extract local files with stored MD5 hashes and check what's missing on cloud
    2. Upload missing files individually
    3. Prune cloud dataset to match local state
    """
    dataset_info = ", ".join([f"{d.name} ({d.id})" for d in datasets])
    logger.info(f"Starting sync to Cognee Cloud: datasets {dataset_info}")

    total_records_downloaded = 0
    total_records_uploaded = 0
    total_bytes_downloaded = 0
    total_bytes_uploaded = 0
    dataset_sync_hashes = {}

    try:
        # Get cloud configuration
        cloud_base_url = await _get_cloud_base_url()
        cloud_auth_token = await _get_cloud_auth_token(user)

        # Step 1: Sync files for all datasets concurrently
        sync_files_tasks = [
            _sync_dataset_files(dataset, cloud_base_url, cloud_auth_token, user, run_id)
            for dataset in datasets
        ]

        logger.info(f"Starting concurrent file sync for {len(datasets)} datasets")

        has_any_uploads = False
        has_any_downloads = False
        processed_datasets = []
        completed_datasets = 0

        # Process datasets concurrently and accumulate results
        for completed_task in asyncio.as_completed(sync_files_tasks):
            try:
                dataset_result = await completed_task
                completed_datasets += 1

                # Update progress based on completed datasets (0-80% for file sync)
                file_sync_progress = int((completed_datasets / len(datasets)) * 80)
                await _safe_update_progress(
                    run_id, "file_sync", progress_percentage=file_sync_progress
                )

                if dataset_result is None:
                    logger.info(
                        f"Progress: {completed_datasets}/{len(datasets)} datasets processed ({file_sync_progress}%)"
                    )
                    continue

                total_records_downloaded += dataset_result.records_downloaded
                total_records_uploaded += dataset_result.records_uploaded
                total_bytes_downloaded += dataset_result.bytes_downloaded
                total_bytes_uploaded += dataset_result.bytes_uploaded

                # Build per-dataset hash tracking for data lineage
                dataset_sync_hashes[dataset_result.dataset_id] = {
                    "uploaded": dataset_result.uploaded_hashes,
                    "downloaded": dataset_result.downloaded_hashes,
                }

                if dataset_result.has_uploads:
                    has_any_uploads = True
                if dataset_result.has_downloads:
                    has_any_downloads = True

                processed_datasets.append(dataset_result.dataset_id)

                logger.info(
                    f"Progress: {completed_datasets}/{len(datasets)} datasets processed ({file_sync_progress}%) - "
                    f"Completed file sync for dataset {dataset_result.dataset_name}: "
                    f"↑{dataset_result.records_uploaded} files ({dataset_result.bytes_uploaded} bytes), "
                    f"↓{dataset_result.records_downloaded} files ({dataset_result.bytes_downloaded} bytes)"
                )
            except Exception as e:
                completed_datasets += 1
                logger.error(f"Dataset file sync failed: {str(e)}")
                # Update progress even for failed datasets
                file_sync_progress = int((completed_datasets / len(datasets)) * 80)
                await _safe_update_progress(
                    run_id, "file_sync", progress_percentage=file_sync_progress
                )
                # Continue with other datasets even if one fails

        # Step 2: Trigger cognify processing once for all datasets (only if any files were uploaded)
        # Update progress to 90% before cognify
        await _safe_update_progress(run_id, "cognify", progress_percentage=90)

        if has_any_uploads and processed_datasets:
            logger.info(
                f"Progress: 90% - Triggering cognify processing for {len(processed_datasets)} datasets with new files"
            )
            try:
                # Trigger cognify for all datasets at once - use first dataset as reference point
                await _trigger_remote_cognify(
                    cloud_base_url, cloud_auth_token, datasets[0].id, run_id
                )
                logger.info("Cognify processing triggered successfully for all datasets")
            except Exception as e:
                logger.warning(f"Failed to trigger cognify processing: {str(e)}")
                # Don't fail the entire sync if cognify fails
        else:
            logger.info(
                "Progress: 90% - Skipping cognify processing - no new files were uploaded across any datasets"
            )

        # Step 3: Trigger local cognify processing if any files were downloaded
        if has_any_downloads and processed_datasets:
            logger.info(
                f"Progress: 95% - Triggering local cognify processing for {len(processed_datasets)} datasets with downloaded files"
            )
            try:
                await cognify()
                logger.info("Local cognify processing completed successfully for all datasets")
            except Exception as e:
                logger.warning(f"Failed to run local cognify processing: {str(e)}")
                # Don't fail the entire sync if local cognify fails
        else:
            logger.info(
                "Progress: 95% - Skipping local cognify processing - no new files were downloaded across any datasets"
            )

        # Update final progress
        try:
            await _safe_update_progress(
                run_id,
                "final",
                progress_percentage=100,
                total_records_to_sync=total_records_uploaded + total_records_downloaded,
                total_records_to_download=total_records_downloaded,
                total_records_to_upload=total_records_uploaded,
                records_downloaded=total_records_downloaded,
                records_uploaded=total_records_uploaded,
            )
        except Exception as e:
            logger.warning(f"Failed to update final sync progress: {str(e)}")

        logger.info(
            f"Multi-dataset sync completed: {len(datasets)} datasets processed, downloaded {total_records_downloaded} records/{total_bytes_downloaded} bytes, uploaded {total_records_uploaded} records/{total_bytes_uploaded} bytes"
        )

        return (
            total_records_downloaded,
            total_records_uploaded,
            total_bytes_downloaded,
            total_bytes_uploaded,
            dataset_sync_hashes,
        )

    except Exception as e:
        logger.error(f"Sync failed: {str(e)}")
        raise ConnectionError(f"Cloud sync failed: {str(e)}")


@dataclass
class DatasetSyncResult:
    """Result of syncing files for a single dataset."""

    dataset_name: str
    dataset_id: str
    records_downloaded: int
    records_uploaded: int
    bytes_downloaded: int
    bytes_uploaded: int
    has_uploads: bool  # Whether any files were uploaded (for cognify decision)
    has_downloads: bool  # Whether any files were downloaded (for cognify decision)
    uploaded_hashes: List[str]  # Content hashes of files uploaded during sync
    downloaded_hashes: List[str]  # Content hashes of files downloaded during sync


async def _sync_dataset_files(
    dataset: Dataset, cloud_base_url: str, cloud_auth_token: str, user: User, run_id: str
) -> Optional[DatasetSyncResult]:
    """
    Sync files for a single dataset (2-way: upload to cloud, download from cloud).
    Does NOT trigger cognify - that's done separately once for all datasets.

    Returns:
        DatasetSyncResult with sync results or None if dataset was empty
    """
    logger.info(f"Syncing files for dataset: {dataset.name} ({dataset.id})")

    try:
        # Step 1: Extract local file info with stored hashes
        local_files = await _extract_local_files_with_hashes(dataset, user, run_id)
        logger.info(f"Found {len(local_files)} local files for dataset {dataset.name}")

        if not local_files:
            logger.info(f"No files to sync for dataset {dataset.name} - skipping")
            return None

        # Step 2: Check what files are missing on cloud
        local_hashes = [f.content_hash for f in local_files]
        hashes_diff_response = await _check_hashes_diff(
            cloud_base_url, cloud_auth_token, dataset, local_hashes, run_id
        )

        hashes_missing_on_remote = hashes_diff_response.missing_on_remote
        hashes_missing_on_local = hashes_diff_response.missing_on_local

        logger.info(
            f"Dataset {dataset.name}: {len(hashes_missing_on_remote)} files to upload, {len(hashes_missing_on_local)} files to download"
        )

        # Step 3: Upload files that are missing on cloud
        bytes_uploaded = await _upload_missing_files(
            cloud_base_url, cloud_auth_token, dataset, local_files, hashes_missing_on_remote, run_id
        )
        logger.info(
            f"Dataset {dataset.name}: Upload complete - {len(hashes_missing_on_remote)} files, {bytes_uploaded} bytes"
        )

        # Step 4: Download files that are missing locally
        bytes_downloaded = await _download_missing_files(
            cloud_base_url, cloud_auth_token, dataset, hashes_missing_on_local, user
        )
        logger.info(
            f"Dataset {dataset.name}: Download complete - {len(hashes_missing_on_local)} files, {bytes_downloaded} bytes"
        )

        return DatasetSyncResult(
            dataset_name=dataset.name,
            dataset_id=str(dataset.id),
            records_downloaded=len(hashes_missing_on_local),
            records_uploaded=len(hashes_missing_on_remote),
            bytes_downloaded=bytes_downloaded,
            bytes_uploaded=bytes_uploaded,
            has_uploads=len(hashes_missing_on_remote) > 0,
            has_downloads=len(hashes_missing_on_local) > 0,
            uploaded_hashes=hashes_missing_on_remote,
            downloaded_hashes=hashes_missing_on_local,
        )

    except Exception as e:
        logger.error(f"Failed to sync files for dataset {dataset.name} ({dataset.id}): {str(e)}")
        raise  # Re-raise to be handled by the caller


async def _extract_local_files_with_hashes(
    dataset: Dataset, user: User, run_id: str
) -> List[LocalFileInfo]:
    """
    Extract local dataset data with existing MD5 hashes from database.

    Args:
        dataset: Dataset to extract files from
        user: User performing the sync
        run_id: Unique identifier for this sync operation

    Returns:
        List[LocalFileInfo]: Information about each local file with stored hash
    """
    try:
        logger.info(f"Extracting files from dataset: {dataset.name} ({dataset.id})")

        # Get all data entries linked to this dataset
        data_entries = await get_dataset_data(dataset.id)
        logger.info(f"Found {len(data_entries)} data entries in dataset")

        # Process each data entry to get file info and hash
        local_files: List[LocalFileInfo] = []
        skipped_count = 0

        for data_entry in data_entries:
            try:
                # Use existing content_hash from database
                content_hash = data_entry.raw_content_hash
                file_size = data_entry.data_size if data_entry.data_size else 0

                # Skip entries without content hash (shouldn't happen in normal cases)
                if not content_hash:
                    skipped_count += 1
                    logger.warning(
                        f"Skipping file {data_entry.name}: missing content_hash in database"
                    )
                    continue

                if file_size == 0:
                    # Get file size from filesystem if not stored
                    file_size = await _get_file_size(data_entry.raw_data_location)

                local_files.append(
                    LocalFileInfo(
                        id=str(data_entry.id),
                        name=data_entry.name,
                        mime_type=data_entry.mime_type,
                        extension=data_entry.extension,
                        raw_data_location=data_entry.raw_data_location,
                        content_hash=content_hash,
                        file_size=file_size,
                        node_set=data_entry.node_set,
                    )
                )

            except Exception as e:
                skipped_count += 1
                logger.warning(f"Failed to process file {data_entry.name}: {str(e)}")
                # Continue with other entries even if one fails
                continue

        logger.info(
            f"File extraction complete: {len(local_files)} files processed, {skipped_count} skipped"
        )
        return local_files

    except Exception as e:
        logger.error(f"Failed to extract files from dataset {dataset.name}: {str(e)}")
        raise


async def _get_file_size(file_path: str) -> int:
    """Get file size in bytes."""
    try:
        file_dir = os.path.dirname(file_path)
        file_name = os.path.basename(file_path)
        file_storage = get_file_storage(file_dir)

        return await file_storage.get_size(file_name)
    except Exception:
        return 0


async def _get_cloud_base_url() -> str:
    """Get Cognee Cloud API base URL."""
    return os.getenv("COGNEE_CLOUD_API_URL", "http://localhost:8001")


async def _get_cloud_auth_token(user: User) -> str:
    """Get authentication token for Cognee Cloud API."""
    return os.getenv("COGNEE_CLOUD_AUTH_TOKEN", "your-auth-token")


async def _check_hashes_diff(
    cloud_base_url: str, auth_token: str, dataset: Dataset, local_hashes: List[str], run_id: str
) -> CheckHashesDiffResponse:
    """
    Check which hashes are missing on cloud.

    Returns:
        List[str]: MD5 hashes that need to be uploaded
    """
    url = f"{cloud_base_url}/api/sync/{dataset.id}/diff"
    headers = {"X-Api-Key": auth_token, "Content-Type": "application/json"}

    payload = CheckMissingHashesRequest(
        dataset_id=str(dataset.id), dataset_name=dataset.name, hashes=local_hashes
    )

    logger.info(f"Checking missing hashes on cloud for dataset {dataset.id}")

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=payload.dict(), headers=headers) as response:
                if response.status == 200:
                    data = await response.json()
                    missing_response = CheckHashesDiffResponse(**data)
                    logger.info(
                        f"Cloud is missing {len(missing_response.missing_on_remote)} out of {len(local_hashes)} files, local is missing {len(missing_response.missing_on_local)} files"
                    )
                    return missing_response
                else:
                    error_text = await response.text()
                    logger.error(
                        f"Failed to check missing hashes: Status {response.status} - {error_text}"
                    )
                    raise ConnectionError(
                        f"Failed to check missing hashes: {response.status} - {error_text}"
                    )

    except Exception as e:
        logger.error(f"Error checking missing hashes: {str(e)}")
        raise ConnectionError(f"Failed to check missing hashes: {str(e)}")


async def _download_missing_files(
    cloud_base_url: str,
    auth_token: str,
    dataset: Dataset,
    hashes_missing_on_local: List[str],
    user: User,
) -> int:
    """
    Download files that are missing locally from the cloud.

    Returns:
        int: Total bytes downloaded
    """
    logger.info(f"Downloading {len(hashes_missing_on_local)} missing files from cloud")

    if not hashes_missing_on_local:
        logger.info("No files need to be downloaded - all files already exist locally")
        return 0

    total_bytes_downloaded = 0
    downloaded_count = 0

    headers = {"X-Api-Key": auth_token}

    async with aiohttp.ClientSession() as session:
        for file_hash in hashes_missing_on_local:
            try:
                # Download file from cloud by hash
                download_url = f"{cloud_base_url}/api/sync/{dataset.id}/data/{file_hash}"

                logger.debug(f"Downloading file with hash: {file_hash}")

                async with session.get(download_url, headers=headers) as response:
                    if response.status == 200:
                        file_content = await response.read()
                        file_size = len(file_content)

                        # Get file metadata from response headers
                        file_name = response.headers.get("X-File-Name", f"file_{file_hash}")

                        # Save file locally using ingestion pipeline
                        await _save_downloaded_file(
                            dataset, file_hash, file_name, file_content, user
                        )

                        total_bytes_downloaded += file_size
                        downloaded_count += 1

                        logger.debug(f"Successfully downloaded {file_name} ({file_size} bytes)")

                    elif response.status == 404:
                        logger.warning(f"File with hash {file_hash} not found on cloud")
                        continue
                    else:
                        error_text = await response.text()
                        logger.error(
                            f"Failed to download file {file_hash}: Status {response.status} - {error_text}"
                        )
                        continue

            except Exception as e:
                logger.error(f"Error downloading file {file_hash}: {str(e)}")
                continue

    logger.info(
        f"Download summary: {downloaded_count}/{len(hashes_missing_on_local)} files downloaded, {total_bytes_downloaded} bytes total"
    )
    return total_bytes_downloaded


class InMemoryDownload:
    def __init__(self, data: bytes, filename: str):
        self.file = io.BufferedReader(io.BytesIO(data))
        self.filename = filename


async def _save_downloaded_file(
    dataset: Dataset,
    file_hash: str,
    file_name: str,
    file_content: bytes,
    user: User,
) -> None:
    """
    Save a downloaded file to local storage and register it in the dataset.
    Uses the existing ingest_data function for consistency with normal ingestion.

    Args:
        dataset: The dataset to add the file to
        file_hash: MD5 hash of the file content
        file_name: Original file name
        file_content: Raw file content bytes
    """
    try:
        # Create a temporary file-like object from the bytes
        file_obj = InMemoryDownload(file_content, file_name)

        # User is injected as dependency

        # Use the existing ingest_data function to properly handle the file
        # This ensures consistency with normal file ingestion
        await ingest_data(
            data=file_obj,
            dataset_name=dataset.name,
            user=user,
            dataset_id=dataset.id,
        )

        logger.debug(f"Successfully saved downloaded file: {file_name} (hash: {file_hash})")

    except Exception as e:
        logger.error(f"Failed to save downloaded file {file_name}: {str(e)}")
        raise


async def _upload_missing_files(
    cloud_base_url: str,
    auth_token: str,
    dataset: Dataset,
    local_files: List[LocalFileInfo],
    hashes_missing_on_remote: List[str],
    run_id: str,
) -> int:
    """
    Upload files that are missing on cloud.

    Returns:
        int: Total bytes uploaded
    """
    # Filter local files to only those with missing hashes
    files_to_upload = [f for f in local_files if f.content_hash in hashes_missing_on_remote]

    logger.info(f"Uploading {len(files_to_upload)} missing files to cloud")

    if not files_to_upload:
        logger.info("No files need to be uploaded - all files already exist on cloud")
        return 0

    total_bytes_uploaded = 0
    uploaded_count = 0

    headers = {"X-Api-Key": auth_token}

    async with aiohttp.ClientSession() as session:
        for file_info in files_to_upload:
            try:
                file_dir = os.path.dirname(file_info.raw_data_location)
                file_name = os.path.basename(file_info.raw_data_location)
                file_storage = get_file_storage(file_dir)

                async with file_storage.open(file_name, mode="rb") as file:
                    file_content = file.read()

                # Upload file
                url = f"{cloud_base_url}/api/sync/{dataset.id}/data/{file_info.id}"

                request_data = aiohttp.FormData()

                request_data.add_field(
                    "file", file_content, content_type=file_info.mime_type, filename=file_info.name
                )
                request_data.add_field("dataset_id", str(dataset.id))
                request_data.add_field("dataset_name", dataset.name)
                request_data.add_field("data_id", str(file_info.id))
                request_data.add_field("mime_type", file_info.mime_type)
                request_data.add_field("extension", file_info.extension)
                request_data.add_field("md5", file_info.content_hash)

                async with session.put(url, data=request_data, headers=headers) as response:
                    if response.status in [200, 201]:
                        total_bytes_uploaded += len(file_content)
                        uploaded_count += 1
                    else:
                        error_text = await response.text()
                        logger.error(
                            f"Failed to upload {file_info.name}: Status {response.status} - {error_text}"
                        )
                        raise ConnectionError(
                            f"Upload failed for {file_info.name}: HTTP {response.status} - {error_text}"
                        )

            except Exception as e:
                logger.error(f"Error uploading file {file_info.name}: {str(e)}")
                raise ConnectionError(f"Upload failed for {file_info.name}: {str(e)}")

    logger.info(f"All {uploaded_count} files uploaded successfully: {total_bytes_uploaded} bytes")
    return total_bytes_uploaded


async def _prune_cloud_dataset(
    cloud_base_url: str, auth_token: str, dataset_id: str, local_hashes: List[str], run_id: str
) -> None:
    """
    Prune cloud dataset to match local state.
    """
    url = f"{cloud_base_url}/api/sync/{dataset_id}?prune=true"
    headers = {"X-Api-Key": auth_token, "Content-Type": "application/json"}

    payload = PruneDatasetRequest(items=local_hashes)

    logger.info("Pruning cloud dataset to match local state")

    try:
        async with aiohttp.ClientSession() as session:
            async with session.put(url, json=payload.dict(), headers=headers) as response:
                if response.status == 200:
                    data = await response.json()
                    deleted_entries = data.get("deleted_database_entries", 0)
                    deleted_files = data.get("deleted_files_from_storage", 0)

                    logger.info(
                        f"Cloud dataset pruned successfully: {deleted_entries} entries deleted, {deleted_files} files removed"
                    )
                else:
                    error_text = await response.text()
                    logger.error(
                        f"Failed to prune cloud dataset: Status {response.status} - {error_text}"
                    )
                    # Don't raise error for prune failures - sync partially succeeded

    except Exception as e:
        logger.error(f"Error pruning cloud dataset: {str(e)}")
        # Don't raise error for prune failures - sync partially succeeded


async def _trigger_remote_cognify(
    cloud_base_url: str, auth_token: str, dataset_id: str, run_id: str
) -> None:
    """
    Trigger cognify processing on the cloud dataset.

    This initiates knowledge graph processing on the synchronized dataset
    using the cloud infrastructure.
    """
    url = f"{cloud_base_url}/api/cognify"
    headers = {"X-Api-Key": auth_token, "Content-Type": "application/json"}

    payload = {
        "dataset_ids": [str(dataset_id)],  # Convert UUID to string for JSON serialization
        "run_in_background": False,
        "custom_prompt": "",
    }

    logger.info(f"Triggering cognify processing for dataset {dataset_id}")

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=payload, headers=headers) as response:
                if response.status == 200:
                    data = await response.json()
                    logger.info(f"Cognify processing started successfully: {data}")

                    # Extract pipeline run IDs for monitoring if available
                    if isinstance(data, dict):
                        for dataset_key, run_info in data.items():
                            if isinstance(run_info, dict) and "pipeline_run_id" in run_info:
                                logger.info(
                                    f"Cognify pipeline run ID for dataset {dataset_key}: {run_info['pipeline_run_id']}"
                                )
                else:
                    error_text = await response.text()
                    logger.warning(
                        f"Failed to trigger cognify processing: Status {response.status} - {error_text}"
                    )
                    # TODO: consider adding retries

    except Exception as e:
        logger.warning(f"Error triggering cognify processing: {str(e)}")
        # TODO: consider adding retries