appkit-assistant 0.17.3__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- appkit_assistant/backend/{models.py → database/models.py} +32 -132
- appkit_assistant/backend/{repositories.py → database/repositories.py} +93 -1
- appkit_assistant/backend/model_manager.py +5 -5
- appkit_assistant/backend/models/__init__.py +28 -0
- appkit_assistant/backend/models/anthropic.py +31 -0
- appkit_assistant/backend/models/google.py +27 -0
- appkit_assistant/backend/models/openai.py +50 -0
- appkit_assistant/backend/models/perplexity.py +56 -0
- appkit_assistant/backend/processors/__init__.py +29 -0
- appkit_assistant/backend/processors/claude_responses_processor.py +205 -387
- appkit_assistant/backend/processors/gemini_responses_processor.py +290 -352
- appkit_assistant/backend/processors/lorem_ipsum_processor.py +6 -4
- appkit_assistant/backend/processors/mcp_mixin.py +297 -0
- appkit_assistant/backend/processors/openai_base.py +11 -125
- appkit_assistant/backend/processors/openai_chat_completion_processor.py +5 -3
- appkit_assistant/backend/processors/openai_responses_processor.py +480 -402
- appkit_assistant/backend/processors/perplexity_processor.py +156 -79
- appkit_assistant/backend/{processor.py → processors/processor_base.py} +7 -2
- appkit_assistant/backend/processors/streaming_base.py +188 -0
- appkit_assistant/backend/schemas.py +138 -0
- appkit_assistant/backend/services/auth_error_detector.py +99 -0
- appkit_assistant/backend/services/chunk_factory.py +273 -0
- appkit_assistant/backend/services/citation_handler.py +292 -0
- appkit_assistant/backend/services/file_cleanup_service.py +316 -0
- appkit_assistant/backend/services/file_upload_service.py +903 -0
- appkit_assistant/backend/services/file_validation.py +138 -0
- appkit_assistant/backend/{mcp_auth_service.py → services/mcp_auth_service.py} +4 -2
- appkit_assistant/backend/services/mcp_token_service.py +61 -0
- appkit_assistant/backend/services/message_converter.py +289 -0
- appkit_assistant/backend/services/openai_client_service.py +120 -0
- appkit_assistant/backend/{response_accumulator.py → services/response_accumulator.py} +163 -1
- appkit_assistant/backend/services/system_prompt_builder.py +89 -0
- appkit_assistant/backend/services/thread_service.py +5 -3
- appkit_assistant/backend/system_prompt_cache.py +3 -3
- appkit_assistant/components/__init__.py +8 -4
- appkit_assistant/components/composer.py +59 -24
- appkit_assistant/components/file_manager.py +623 -0
- appkit_assistant/components/mcp_server_dialogs.py +12 -20
- appkit_assistant/components/mcp_server_table.py +12 -2
- appkit_assistant/components/message.py +119 -2
- appkit_assistant/components/thread.py +1 -1
- appkit_assistant/components/threadlist.py +4 -2
- appkit_assistant/components/tools_modal.py +37 -20
- appkit_assistant/configuration.py +12 -0
- appkit_assistant/state/file_manager_state.py +697 -0
- appkit_assistant/state/mcp_oauth_state.py +3 -3
- appkit_assistant/state/mcp_server_state.py +47 -2
- appkit_assistant/state/system_prompt_state.py +1 -1
- appkit_assistant/state/thread_list_state.py +99 -5
- appkit_assistant/state/thread_state.py +88 -9
- {appkit_assistant-0.17.3.dist-info → appkit_assistant-1.0.1.dist-info}/METADATA +8 -6
- appkit_assistant-1.0.1.dist-info/RECORD +58 -0
- appkit_assistant/backend/processors/claude_base.py +0 -178
- appkit_assistant/backend/processors/gemini_base.py +0 -84
- appkit_assistant-0.17.3.dist-info/RECORD +0 -39
- /appkit_assistant/backend/{file_manager.py → services/file_manager.py} +0 -0
- {appkit_assistant-0.17.3.dist-info → appkit_assistant-1.0.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,903 @@
|
|
|
1
|
+
"""File upload service for managing OpenAI file uploads and vector stores.
|
|
2
|
+
|
|
3
|
+
Handles uploading files to OpenAI, creating/managing vector stores per thread,
|
|
4
|
+
and tracking uploads in the database for cleanup purposes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import logging
|
|
9
|
+
from collections.abc import AsyncGenerator
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from openai import AsyncOpenAI
|
|
14
|
+
from sqlalchemy import select
|
|
15
|
+
|
|
16
|
+
from appkit_assistant.backend.database.models import (
|
|
17
|
+
AssistantFileUpload,
|
|
18
|
+
AssistantThread,
|
|
19
|
+
)
|
|
20
|
+
from appkit_assistant.backend.database.repositories import file_upload_repo
|
|
21
|
+
from appkit_assistant.backend.schemas import (
|
|
22
|
+
Chunk,
|
|
23
|
+
ChunkType,
|
|
24
|
+
)
|
|
25
|
+
from appkit_assistant.backend.services.chunk_factory import ChunkFactory
|
|
26
|
+
from appkit_assistant.configuration import FileUploadConfig
|
|
27
|
+
from appkit_commons.database.session import get_asyncdb_session
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class FileUploadError(Exception):
    """Raised when file upload operations fail."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FileUploadService:
|
|
37
|
+
"""Service for managing file uploads to OpenAI and vector store lifecycle.
|
|
38
|
+
|
|
39
|
+
Handles:
|
|
40
|
+
- Uploading files to OpenAI with size/count validation
|
|
41
|
+
- Creating vector stores per thread with configurable expiration
|
|
42
|
+
- Adding files to existing vector stores
|
|
43
|
+
- Tracking uploads in database for cleanup
|
|
44
|
+
- Retry logic with cleanup on failure
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
def __init__(
    self,
    client: AsyncOpenAI,
    config: FileUploadConfig | None = None,
) -> None:
    """Set up the service with a shared client and upload limits.

    Args:
        client: AsyncOpenAI client instance (shared from processor).
        config: File upload configuration. Uses defaults if not provided.
    """
    self.client = client
    self.config = config or FileUploadConfig()
    # Pre-compute the byte limit once so per-upload size checks stay cheap.
    self._max_file_size_bytes = self.config.max_file_size_mb * 1024 * 1024
    # Factory tags every chunk this service emits with its source name.
    self._chunk_factory = ChunkFactory("file_upload_service")
|
|
62
|
+
|
|
63
|
+
async def _recreate_vector_store(
    self,
    session: Any,
    thread: AssistantThread,
    thread_uuid: str,
) -> tuple[str, str]:
    """Recreate a vector store that no longer exists in OpenAI.

    Creates a new vector store and adds all existing files from the thread.
    File migration is best-effort: a file that fails to attach is logged
    and skipped, not retried.

    Args:
        session: Database session.
        thread: The thread whose vector store needs recreation.
        thread_uuid: UUID string of the thread (for naming).

    Returns:
        Tuple of (new_vector_store_id, vector_store_name).

    Raises:
        FileUploadError: If recreation fails.
    """
    # Remembered only for the final log line below.
    old_vector_store_id = thread.vector_store_id

    # Get existing file records for this thread
    existing_files = await file_upload_repo.find_by_thread(session, thread.id)
    openai_file_ids = [f.openai_file_id for f in existing_files]

    logger.info(
        "Recreating vector store for thread %s with %d existing files",
        thread_uuid,
        len(openai_file_ids),
    )

    # Create new vector store
    vector_store = await self._create_vector_store_with_retry(thread_uuid)
    new_vector_store_id = vector_store.id
    vector_store_name = vector_store.name or f"Thread-{thread_uuid}"

    # Add existing files to new vector store (best-effort per file).
    files_added = 0
    for file_id in openai_file_ids:
        try:
            await self.client.vector_stores.files.create(
                vector_store_id=new_vector_store_id,
                file_id=file_id,
            )
            files_added += 1
        except Exception as e:
            logger.warning(
                "Failed to add file %s to new vector store: %s",
                file_id,
                e,
            )

    # Update thread with new vector store ID
    thread.vector_store_id = new_vector_store_id
    session.add(thread)

    # Update all file records with new vector store ID, even those whose
    # migration failed above, so they still point at the live store.
    for file_record in existing_files:
        file_record.vector_store_id = new_vector_store_id
        file_record.vector_store_name = vector_store_name
        session.add(file_record)

    # Single commit covers both the thread and the file-record updates.
    await session.commit()

    logger.info(
        "Recreated vector store: %s -> %s (%d/%d files migrated)",
        old_vector_store_id,
        new_vector_store_id,
        files_added,
        len(openai_file_ids),
    )

    return new_vector_store_id, vector_store_name
|
|
138
|
+
|
|
139
|
+
async def _add_files_to_vector_store(
    self,
    vector_store_id: str,
    vector_store_name: str,
    file_ids: list[str],
    thread_id: int,
    user_id: int,
    filenames: list[str],
    file_sizes: list[int],
) -> None:
    """Add uploaded files to a vector store and track in database (private helper).

    Unlike the best-effort migration in `_recreate_vector_store`, this is
    fail-fast: the first attach failure aborts before any DB tracking.
    `file_ids`, `filenames` and `file_sizes` are parallel lists
    (enforced by ``zip(..., strict=True)`` below).

    Args:
        vector_store_id: The vector store to add files to.
        vector_store_name: The name of the vector store.
        file_ids: List of OpenAI file IDs to add.
        thread_id: Database ID of the thread.
        user_id: ID of the user who uploaded the files.
        filenames: Original filenames for each file.
        file_sizes: Size in bytes for each file.

    Raises:
        FileUploadError: If adding files fails.
    """
    if not file_ids:
        return

    # Add files to vector store
    for file_id in file_ids:
        try:
            await self.client.vector_stores.files.create(
                vector_store_id=vector_store_id,
                file_id=file_id,
            )
            logger.debug(
                "Added file %s to vector store %s",
                file_id,
                vector_store_id,
            )
        except Exception as e:
            logger.error(
                "Failed to add file %s to vector store: %s",
                file_id,
                e,
            )
            raise FileUploadError(f"Failed to add file to vector store: {e}") from e

    # Track in database only after every file attached successfully.
    async with get_asyncdb_session() as session:
        for file_id, filename, size in zip(
            file_ids, filenames, file_sizes, strict=True
        ):
            upload_record = AssistantFileUpload(
                filename=filename,
                openai_file_id=file_id,
                vector_store_id=vector_store_id,
                vector_store_name=vector_store_name,
                thread_id=thread_id,
                user_id=user_id,
                file_size=size,
            )
            session.add(upload_record)

        await session.commit()
        logger.debug(
            "Tracked %d file uploads in database",
            len(file_ids),
        )
|
|
207
|
+
|
|
208
|
+
async def _validate_file_count(self, thread_id: int) -> None:
    """Ensure the thread has room for at least one more file upload."""
    limit = self.config.max_files_per_thread
    stmt = select(AssistantFileUpload).where(
        AssistantFileUpload.thread_id == thread_id
    )

    async with get_asyncdb_session() as session:
        rows = (await session.execute(stmt)).scalars().all()

    # Reject once the thread already holds the configured maximum.
    if len(rows) >= limit:
        raise FileUploadError(f"Maximum files per thread ({limit}) reached")
|
|
223
|
+
|
|
224
|
+
async def _upload_with_retry(self, path: Path, max_retries: int = 2) -> str:
    """Upload a file to OpenAI, retrying on transient failures.

    Args:
        path: Path to the file.
        max_retries: Maximum number of attempts.

    Returns:
        The OpenAI file ID.

    Raises:
        FileUploadError: If all retries fail.
    """
    failure: Exception | None = None

    for attempt_no in range(1, max_retries + 1):
        try:
            payload = path.read_bytes()
            created = await self.client.files.create(
                file=(path.name, payload),
                purpose="assistants",
            )
        except Exception as exc:
            failure = exc
            logger.warning(
                "File upload attempt %d failed: %s",
                attempt_no,
                exc,
            )
            # Brief pause before retrying; no sleep after the final attempt.
            if attempt_no < max_retries:
                await asyncio.sleep(1)
        else:
            return created.id

    msg = f"Failed to upload file after {max_retries} attempts"
    raise FileUploadError(msg) from failure
|
|
259
|
+
|
|
260
|
+
async def _wait_for_processing(  # noqa: PLR0912
    self,
    vector_store_id: str,
    file_ids: list[str],
    filenames: list[str],
    max_wait_seconds: int = 60,
) -> AsyncGenerator[Chunk, None]:
    """Wait for files to be processed, yielding progress chunks in real-time.

    Polls the vector store once per second until every file reaches a
    terminal status ("completed", "failed", "cancelled") or the timeout
    elapses.

    Args:
        vector_store_id: The vector store containing the files.
        file_ids: List of file IDs to wait for.
        filenames: List of original filenames for progress display.
        max_wait_seconds: Maximum seconds to wait.

    Yields:
        Chunk objects with processing status updates.
    """
    if not file_ids:
        return

    # Map file IDs to filenames for display
    file_id_to_name = dict(zip(file_ids, filenames, strict=True))
    total_files = len(file_ids)
    completed_count = 0

    # Initial processing chunk
    if total_files == 1:
        initial_text = f"Indiziere: {filenames[0]}"
    else:
        initial_text = f"Indiziere {total_files} Dateien..."
    yield self._chunk_factory.create(
        ChunkType.PROCESSING,
        initial_text,
        {
            "status": "indexing",
            "total_files": total_files,
            "completed_files": 0,
        },
    )

    # FIX: use get_running_loop() (get_event_loop() is deprecated inside a
    # coroutine); loop.time() is a monotonic clock, safe for timeouts.
    loop = asyncio.get_running_loop()
    start_time = loop.time()
    pending_files = set(file_ids)
    success = True

    while pending_files and (loop.time() - start_time) < max_wait_seconds:
        vs_files = await self.client.vector_stores.files.list(
            vector_store_id=vector_store_id
        )

        for vs_file in vs_files.data:
            if vs_file.id in pending_files:
                if vs_file.status == "completed":
                    pending_files.discard(vs_file.id)
                    completed_count += 1
                    filename = file_id_to_name.get(vs_file.id, vs_file.id)
                    logger.debug("File indexed: %s", vs_file.id)

                    # Progress update chunk.
                    # FIX: interpolate the actual filename (the previous
                    # text contained a literal placeholder).
                    progress_text = f"Indiziert: {filename}"
                    yield self._chunk_factory.create(
                        ChunkType.PROCESSING,
                        progress_text,
                        {
                            "status": "progress",
                            "total_files": total_files,
                            "completed_files": completed_count,
                            "current_file": filename,
                        },
                    )
                elif vs_file.status in ("failed", "cancelled"):
                    error_msg = ""
                    if vs_file.last_error:
                        error_msg = vs_file.last_error.message
                    logger.error(
                        "File indexing failed: %s - %s",
                        vs_file.id,
                        error_msg,
                    )
                    failed_name = file_id_to_name.get(vs_file.id, vs_file.id)
                    yield self._chunk_factory.create(
                        ChunkType.PROCESSING,
                        f"Fehler: {failed_name}",
                        {
                            "status": "failed",
                            "total_files": total_files,
                            "completed_files": completed_count,
                            "error": error_msg,
                        },
                    )
                    pending_files.discard(vs_file.id)
                    success = False

        if pending_files:
            await asyncio.sleep(1)

    # Timeout: report how far we got and stop without a success chunk.
    if pending_files:
        logger.warning("Timeout waiting for files: %s", pending_files)
        yield self._chunk_factory.create(
            ChunkType.PROCESSING,
            f"Zeitüberschreitung ({completed_count}/{total_files})",
            {
                "status": "timeout",
                "total_files": total_files,
                "completed_files": completed_count,
            },
        )
        return

    # Final success chunk (only if no file failed or was cancelled).
    if success:
        if total_files == 1:
            done_text = f"Bereit: {filenames[0]}"
        else:
            done_text = f"{total_files} Dateien bereit"
        yield self._chunk_factory.create(
            ChunkType.PROCESSING,
            done_text,
            {
                "status": "completed",
                "total_files": total_files,
                "completed_files": total_files,
            },
        )
|
|
385
|
+
|
|
386
|
+
async def _create_vector_store_with_retry(
    self,
    thread_uuid: str,
    max_retries: int = 2,
) -> Any:
    """Create a thread-scoped vector store, retrying transient failures.

    Args:
        thread_uuid: Thread UUID for naming the store.
        max_retries: Maximum number of attempts.

    Returns:
        The created vector store object.

    Raises:
        FileUploadError: If all retries fail.
    """
    failure: Exception | None = None

    for attempt_no in range(1, max_retries + 1):
        try:
            # Stores expire automatically after the configured idle period.
            return await self.client.vector_stores.create(
                name=f"Thread-{thread_uuid}",
                expires_after={
                    "anchor": "last_active_at",
                    "days": self.config.vector_store_expiration_days,
                },
            )
        except Exception as exc:
            failure = exc
            logger.warning(
                "Vector store creation attempt %d failed: %s",
                attempt_no,
                exc,
            )
            # Short back-off before the next attempt (none after the last).
            if attempt_no < max_retries:
                await asyncio.sleep(1)

    raise FileUploadError(
        f"Failed to create vector store after {max_retries} attempts"
    ) from failure
|
|
427
|
+
|
|
428
|
+
async def _delete_files_from_vector_stores(
    self, db_files: list[AssistantFileUpload]
) -> None:
    """Delete files FROM their vector stores (Level 1)."""
    # Group file IDs by the vector store that holds them; records
    # without a vector store are skipped.
    grouped: dict[str, list[str]] = {}
    for record in db_files:
        if record.vector_store_id:
            grouped.setdefault(record.vector_store_id, []).append(
                record.openai_file_id
            )

    # Detach each file from its store; failures are logged, not fatal.
    for store_id, store_file_ids in grouped.items():
        for fid in store_file_ids:
            try:
                await self.client.vector_stores.files.delete(
                    vector_store_id=store_id,
                    file_id=fid,
                )
                logger.debug("Deleted file %s from vector store %s", fid, store_id)
            except Exception as exc:
                logger.warning(
                    "Failed to delete file %s from vector store %s: %s",
                    fid,
                    store_id,
                    exc,
                )
|
|
458
|
+
|
|
459
|
+
async def _delete_files_from_openai(self, file_ids: list[str]) -> dict[str, bool]:
    """Delete files from OpenAI (Level 2)."""
    outcome: dict[str, bool] = {}
    for fid in file_ids:
        try:
            await self.client.files.delete(file_id=fid)
        except Exception as exc:
            # Record the failure and continue with the remaining files.
            logger.warning("Failed to delete OpenAI file %s: %s", fid, exc)
            outcome[fid] = False
        else:
            logger.debug("Deleted OpenAI file: %s", fid)
            outcome[fid] = True
    return outcome
|
|
471
|
+
|
|
472
|
+
async def _delete_file_db_records(
    self,
    db_files: list[AssistantFileUpload],
    deletion_results: dict[str, bool],
) -> None:
    """Delete database records for successfully deleted files (Level 3).

    Args:
        db_files: Candidate upload records.
        deletion_results: Mapping of openai_file_id to whether the
            OpenAI-side deletion succeeded; only successful ones get
            their DB row removed.
    """
    # FIX: use a set so membership tests below are O(1) instead of
    # scanning a list for every record.
    deleted_file_ids = {fid for fid, success in deletion_results.items() if success}
    if not deleted_file_ids:
        return

    async with get_asyncdb_session() as session:
        for db_file in db_files:
            if db_file.openai_file_id in deleted_file_ids:
                try:
                    await session.delete(db_file)
                    logger.debug(
                        "Deleted DB record for file: %s", db_file.openai_file_id
                    )
                except Exception as e:
                    # Best-effort: one bad row must not block cleanup of
                    # the others.
                    logger.warning(
                        "Failed to delete DB record for file %s: %s",
                        db_file.openai_file_id,
                        e,
                    )
        await session.commit()
|
|
497
|
+
|
|
498
|
+
async def upload_file(
    self,
    file_path: str,
    thread_id: int,
    user_id: int,  # noqa: ARG002
) -> str:
    """Upload a file to OpenAI for assistants/file_search.

    Args:
        file_path: Local path to the file to upload.
        thread_id: Database ID of the thread this file belongs to.
        user_id: ID of the user uploading the file.

    Returns:
        The OpenAI file ID.

    Raises:
        FileUploadError: If validation fails or upload errors occur.
    """
    source = Path(file_path)

    # Guard: the file must exist on disk.
    if not source.exists():
        raise FileUploadError(f"Datei nicht gefunden: {file_path}")

    # Guard: enforce the configured size limit.
    if source.stat().st_size > self._max_file_size_bytes:
        raise FileUploadError(
            f"Datei überschreitet die maximale Größe von {self.config.max_file_size_mb}MB"
        )

    # Guard: enforce the per-thread file count limit.
    await self._validate_file_count(thread_id)

    # All validations passed - upload with retry logic.
    uploaded_id = await self._upload_with_retry(source)

    logger.info(
        "Uploaded file to OpenAI: %s -> %s",
        source.name,
        uploaded_id,
    )

    return uploaded_id
|
|
543
|
+
|
|
544
|
+
async def process_files(  # noqa: PLR0912
    self,
    file_paths: list[str],
    thread_db_id: int,
    thread_uuid: str,
    user_id: int,
) -> AsyncGenerator[Chunk, None]:
    """Process files for a thread, yielding progress chunks in real-time.

    Uploads files, creates/gets vector store, adds files to it, and waits
    for indexing - yielding progress updates as each step happens.

    Final chunk has metadata with 'vector_store_id' key.

    Args:
        file_paths: List of local file paths to process.
        thread_db_id: Database ID of the thread.
        thread_uuid: UUID string of the thread.
        user_id: ID of the user.

    Yields:
        Chunk objects with real-time progress updates.
        Final chunk contains 'vector_store_id' in metadata.

    Raises:
        FileUploadError: If any step fails (with cleanup of uploaded files).
    """
    if not file_paths:
        return

    uploaded_file_ids: list[str] = []
    filenames: list[str] = []
    file_sizes: list[int] = []
    total_files = len(file_paths)
    vector_store_id: str | None = None

    try:
        # Phase 1: Upload files to OpenAI
        for i, file_path in enumerate(file_paths, 1):
            path = Path(file_path)
            filename = path.name

            # FIX: interpolate the actual filename (the previous text
            # contained a literal placeholder).
            yield self._chunk_factory.create(
                ChunkType.PROCESSING,
                f"Lade hoch: {filename} ({i}/{total_files})",
                {
                    "status": "uploading",
                    "current_file": filename,
                    "file_index": i,
                    "total_files": total_files,
                },
            )

            file_id = await self.upload_file(file_path, thread_db_id, user_id)
            uploaded_file_ids.append(file_id)
            filenames.append(filename)
            file_sizes.append(path.stat().st_size)

        # Phase 2: Get or create vector store
        yield self._chunk_factory.create(
            ChunkType.PROCESSING,
            "Bereite Vector Store vor...",
            {"status": "preparing_store"},
        )

        vector_store_id, vector_store_name = await self.get_vector_store(
            thread_db_id, thread_uuid
        )

        # Phase 3: Add files to vector store
        for i, (file_id, filename) in enumerate(
            zip(uploaded_file_ids, filenames, strict=True), 1
        ):
            # FIX: interpolate the actual filename here as well.
            yield self._chunk_factory.create(
                ChunkType.PROCESSING,
                f"Füge hinzu: {filename} ({i}/{total_files})",
                {
                    "status": "adding_to_store",
                    "current_file": filename,
                    "file_index": i,
                    "total_files": total_files,
                },
            )

            await self.client.vector_stores.files.create(
                vector_store_id=vector_store_id,
                file_id=file_id,
            )
            logger.debug(
                "Added file %s to vector store %s", file_id, vector_store_id
            )

        # Track in database
        async with get_asyncdb_session() as session:
            for file_id, filename, size in zip(
                uploaded_file_ids, filenames, file_sizes, strict=True
            ):
                upload_record = AssistantFileUpload(
                    filename=filename,
                    openai_file_id=file_id,
                    vector_store_id=vector_store_id,
                    vector_store_name=vector_store_name,
                    thread_id=thread_db_id,
                    user_id=user_id,
                    file_size=size,
                )
                session.add(upload_record)
            await session.commit()
            logger.debug("Tracked %d file uploads in database", len(filenames))

        # Phase 4: Wait for indexing with streaming progress
        async for chunk in self._wait_for_processing(
            vector_store_id, uploaded_file_ids, filenames
        ):
            # Add vector_store_id to final chunk metadata so the caller
            # can attach the store to the conversation.
            if chunk.chunk_metadata and chunk.chunk_metadata.get("status") in (
                "completed",
                "timeout",
                "failed",
            ):
                chunk.chunk_metadata["vector_store_id"] = vector_store_id
            yield chunk

    except Exception:
        # Cleanup uploaded files on failure, then re-raise the original
        # error unchanged.
        if uploaded_file_ids:
            logger.warning(
                "Cleaning up %d uploaded files due to error",
                len(uploaded_file_ids),
            )
            await self.delete_files(uploaded_file_ids)
        raise
|
|
676
|
+
|
|
677
|
+
async def get_vector_store(
    self,
    thread_id: int,
    thread_uuid: str,
) -> tuple[str, str]:
    """Get existing vector store for thread or create a new one.

    Automatically validates the vector store exists in OpenAI and recreates
    if missing.

    Args:
        thread_id: Database ID of the thread.
        thread_uuid: UUID string of the thread (for naming).

    Returns:
        Tuple of (vector_store_id, vector_store_name).

    Raises:
        FileUploadError: If vector store creation fails.
    """
    async with get_asyncdb_session() as session:
        result = await session.execute(
            select(AssistantThread).where(AssistantThread.id == thread_id)
        )
        thread = result.scalar_one_or_none()

        if not thread:
            raise FileUploadError(f"Thread not found: {thread_id}")

        # Return existing vector store if present and valid
        if thread.vector_store_id:
            logger.debug(
                "Checking existing vector store: %s",
                thread.vector_store_id,
            )
            # Validate vector store exists in OpenAI
            try:
                vs = await self.client.vector_stores.retrieve(
                    thread.vector_store_id
                )
                return thread.vector_store_id, vs.name or ""
            except Exception as e:
                # NOTE(review): "not found"/"404" substring matching on the
                # lowered error text is fragile — assumes the SDK error
                # message contains one of these; confirm against the
                # OpenAI SDK's NotFoundError type.
                error_msg = str(e).lower()
                if "not found" in error_msg or "404" in error_msg:
                    logger.warning(
                        "Vector store %s no longer exists, creating new one",
                        thread.vector_store_id,
                    )
                    # Vector store doesn't exist - create new one and migrate files
                    return await self._recreate_vector_store(
                        session, thread, thread_uuid
                    )
                # Other error - log but try to continue with the stored ID
                logger.warning(
                    "Error checking vector store %s: %s",
                    thread.vector_store_id,
                    e,
                )
                return thread.vector_store_id, f"Thread-{thread_uuid}"

        # Create new vector store with expiration
        vector_store = await self._create_vector_store_with_retry(thread_uuid)
        vector_store_id = vector_store.id
        vector_store_name = vector_store.name or f"Thread-{thread_uuid}"

        # Update thread with vector store ID
        thread.vector_store_id = vector_store_id
        session.add(thread)
        await session.commit()

        logger.info(
            "Created vector store for thread %s: %s",
            thread_uuid,
            vector_store_id,
        )

        return vector_store_id, vector_store_name
|
|
754
|
+
|
|
755
|
+
async def delete_files(self, file_ids: list[str]) -> dict[str, bool]:
    """Delete files with proper three-level ordering.

    Order:
        1. Delete files FROM their vector stores (remove association)
        2. Delete files from OpenAI
        3. Delete database records

    Args:
        file_ids: List of OpenAI file IDs to delete.

    Returns:
        Dictionary mapping file_id to deletion success status.
    """
    if not file_ids:
        return {}

    # Look up the DB records first so we know which vector store each
    # file is attached to.
    stmt = select(AssistantFileUpload).where(
        AssistantFileUpload.openai_file_id.in_(file_ids)
    )
    async with get_asyncdb_session() as session:
        records = (await session.execute(stmt)).scalars().all()

    # Level 1: detach files from their vector stores.
    await self._delete_files_from_vector_stores(records)

    # Level 2: remove the underlying OpenAI file objects.
    outcome = await self._delete_files_from_openai(file_ids)

    # Level 3: drop DB rows only for files OpenAI actually deleted.
    await self._delete_file_db_records(records, outcome)

    return outcome
|
|
791
|
+
|
|
792
|
+
async def delete_vector_store(self, vector_store_id: str) -> bool:
    """Delete a vector store with proper ordering.

    Order:
    1. Delete all files in the vector store (via delete_files - 3-level deletion)
    2. Delete the vector store container itself

    Args:
        vector_store_id: The vector store ID to delete.

    Returns:
        True if vector store was successfully deleted, False otherwise.
    """
    if not vector_store_id:
        return False

    logger.info("Deleting vector store: %s", vector_store_id)

    # Step 1: List and delete all files in the vector store
    try:
        vs_files = await self.client.vector_stores.files.list(
            vector_store_id=vector_store_id
        )
        # Iterate via the SDK's async auto-pagination instead of reading
        # only `.data` (the first page), so vector stores holding more
        # than one page of files are fully cleaned up.
        file_ids = [vs_file.id async for vs_file in vs_files]

        if file_ids:
            logger.info(
                "Deleting %d files from vector store %s",
                len(file_ids),
                vector_store_id,
            )
            deletion_results = await self.delete_files(file_ids)
            successful = sum(1 for success in deletion_results.values() if success)
            logger.info(
                "Successfully deleted %d/%d files from vector store %s",
                successful,
                len(file_ids),
                vector_store_id,
            )
    except Exception as e:
        # Best-effort: a failed file sweep should not prevent the attempt
        # to delete the container itself below.
        logger.warning(
            "Failed to delete files from vector store %s: %s",
            vector_store_id,
            e,
        )

    # Step 2: Delete the vector store container itself
    try:
        await self.client.vector_stores.delete(vector_store_id=vector_store_id)
        logger.info("Deleted vector store: %s", vector_store_id)
        return True
    except Exception as e:
        # Expiration policy on the store means a failed delete is not fatal.
        logger.warning(
            "Failed to delete vector store %s (will auto-expire): %s",
            vector_store_id,
            e,
        )
        return False
|
|
850
|
+
|
|
851
|
+
async def cleanup_deleted_thread(
    self,
    thread_db_id: int,
    vector_store_id: str | None,
) -> dict[str, Any]:
    """Clean up all resources for a deleted thread.

    Deletion is handled by delete_vector_store which:
    1. Deletes all files (3-level: from VS, from OpenAI, from DB)
    2. Deletes the vector store container

    Args:
        thread_db_id: Database ID of the deleted thread.
        vector_store_id: The vector store ID (if any) associated with the thread.

    Returns:
        Dictionary with cleanup statistics:
        {
            'vector_store_deleted': bool,
            'thread_db_id': int,
            'errors': list[str]
        }
    """
    logger.info("Starting cleanup for deleted thread %d", thread_db_id)

    stats: dict[str, Any] = {
        "vector_store_deleted": False,
        "thread_db_id": thread_db_id,
        "errors": [],
    }

    # Nothing to do when the thread never had a vector store.
    if not vector_store_id:
        logger.debug(
            "No vector store to clean up for thread %d",
            thread_db_id,
        )
        return stats

    # delete_vector_store handles all file deletion internally.
    if await self.delete_vector_store(vector_store_id):
        stats["vector_store_deleted"] = True
    else:
        stats["errors"].append(f"Failed to delete vector store {vector_store_id}")

    logger.info(
        "Cleanup completed for thread %d: VS=%s, errors=%d",
        thread_db_id,
        stats["vector_store_deleted"],
        len(stats["errors"]),
    )

    return stats
|