appkit-assistant 0.17.3__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. appkit_assistant/backend/{models.py → database/models.py} +32 -132
  2. appkit_assistant/backend/{repositories.py → database/repositories.py} +93 -1
  3. appkit_assistant/backend/model_manager.py +5 -5
  4. appkit_assistant/backend/models/__init__.py +28 -0
  5. appkit_assistant/backend/models/anthropic.py +31 -0
  6. appkit_assistant/backend/models/google.py +27 -0
  7. appkit_assistant/backend/models/openai.py +50 -0
  8. appkit_assistant/backend/models/perplexity.py +56 -0
  9. appkit_assistant/backend/processors/__init__.py +29 -0
  10. appkit_assistant/backend/processors/claude_responses_processor.py +205 -387
  11. appkit_assistant/backend/processors/gemini_responses_processor.py +290 -352
  12. appkit_assistant/backend/processors/lorem_ipsum_processor.py +6 -4
  13. appkit_assistant/backend/processors/mcp_mixin.py +297 -0
  14. appkit_assistant/backend/processors/openai_base.py +11 -125
  15. appkit_assistant/backend/processors/openai_chat_completion_processor.py +5 -3
  16. appkit_assistant/backend/processors/openai_responses_processor.py +480 -402
  17. appkit_assistant/backend/processors/perplexity_processor.py +156 -79
  18. appkit_assistant/backend/{processor.py → processors/processor_base.py} +7 -2
  19. appkit_assistant/backend/processors/streaming_base.py +188 -0
  20. appkit_assistant/backend/schemas.py +138 -0
  21. appkit_assistant/backend/services/auth_error_detector.py +99 -0
  22. appkit_assistant/backend/services/chunk_factory.py +273 -0
  23. appkit_assistant/backend/services/citation_handler.py +292 -0
  24. appkit_assistant/backend/services/file_cleanup_service.py +316 -0
  25. appkit_assistant/backend/services/file_upload_service.py +903 -0
  26. appkit_assistant/backend/services/file_validation.py +138 -0
  27. appkit_assistant/backend/{mcp_auth_service.py → services/mcp_auth_service.py} +4 -2
  28. appkit_assistant/backend/services/mcp_token_service.py +61 -0
  29. appkit_assistant/backend/services/message_converter.py +289 -0
  30. appkit_assistant/backend/services/openai_client_service.py +120 -0
  31. appkit_assistant/backend/{response_accumulator.py → services/response_accumulator.py} +163 -1
  32. appkit_assistant/backend/services/system_prompt_builder.py +89 -0
  33. appkit_assistant/backend/services/thread_service.py +5 -3
  34. appkit_assistant/backend/system_prompt_cache.py +3 -3
  35. appkit_assistant/components/__init__.py +8 -4
  36. appkit_assistant/components/composer.py +59 -24
  37. appkit_assistant/components/file_manager.py +623 -0
  38. appkit_assistant/components/mcp_server_dialogs.py +12 -20
  39. appkit_assistant/components/mcp_server_table.py +12 -2
  40. appkit_assistant/components/message.py +119 -2
  41. appkit_assistant/components/thread.py +1 -1
  42. appkit_assistant/components/threadlist.py +4 -2
  43. appkit_assistant/components/tools_modal.py +37 -20
  44. appkit_assistant/configuration.py +12 -0
  45. appkit_assistant/state/file_manager_state.py +697 -0
  46. appkit_assistant/state/mcp_oauth_state.py +3 -3
  47. appkit_assistant/state/mcp_server_state.py +47 -2
  48. appkit_assistant/state/system_prompt_state.py +1 -1
  49. appkit_assistant/state/thread_list_state.py +99 -5
  50. appkit_assistant/state/thread_state.py +88 -9
  51. {appkit_assistant-0.17.3.dist-info → appkit_assistant-1.0.1.dist-info}/METADATA +8 -6
  52. appkit_assistant-1.0.1.dist-info/RECORD +58 -0
  53. appkit_assistant/backend/processors/claude_base.py +0 -178
  54. appkit_assistant/backend/processors/gemini_base.py +0 -84
  55. appkit_assistant-0.17.3.dist-info/RECORD +0 -39
  56. /appkit_assistant/backend/{file_manager.py → services/file_manager.py} +0 -0
  57. {appkit_assistant-0.17.3.dist-info → appkit_assistant-1.0.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,903 @@
1
+ """File upload service for managing OpenAI file uploads and vector stores.
2
+
3
+ Handles uploading files to OpenAI, creating/managing vector stores per thread,
4
+ and tracking uploads in the database for cleanup purposes.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ from collections.abc import AsyncGenerator
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from openai import AsyncOpenAI
14
+ from sqlalchemy import select
15
+
16
+ from appkit_assistant.backend.database.models import (
17
+ AssistantFileUpload,
18
+ AssistantThread,
19
+ )
20
+ from appkit_assistant.backend.database.repositories import file_upload_repo
21
+ from appkit_assistant.backend.schemas import (
22
+ Chunk,
23
+ ChunkType,
24
+ )
25
+ from appkit_assistant.backend.services.chunk_factory import ChunkFactory
26
+ from appkit_assistant.configuration import FileUploadConfig
27
+ from appkit_commons.database.session import get_asyncdb_session
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class FileUploadError(Exception):
    """Signal that a file upload or vector store operation could not complete."""
34
+
35
+
36
+ class FileUploadService:
37
+ """Service for managing file uploads to OpenAI and vector store lifecycle.
38
+
39
+ Handles:
40
+ - Uploading files to OpenAI with size/count validation
41
+ - Creating vector stores per thread with configurable expiration
42
+ - Adding files to existing vector stores
43
+ - Tracking uploads in database for cleanup
44
+ - Retry logic with cleanup on failure
45
+ """
46
+
47
def __init__(
    self,
    client: AsyncOpenAI,
    config: FileUploadConfig | None = None,
) -> None:
    """Store the OpenAI client and derive the upload limits.

    Args:
        client: AsyncOpenAI client used for every file and vector store call.
        config: Upload settings. A default ``FileUploadConfig`` is created
            when this is omitted (or falsy).
    """
    self.client = client
    settings = config or FileUploadConfig()
    self.config = settings
    # Keep the limit in bytes so size validation is a plain integer compare.
    self._max_file_size_bytes = settings.max_file_size_mb * 1024 * 1024
    self._chunk_factory = ChunkFactory("file_upload_service")
62
+
63
async def _recreate_vector_store(
    self,
    session: Any,
    thread: AssistantThread,
    thread_uuid: str,
) -> tuple[str, str]:
    """Replace a thread's missing vector store and re-attach its files.

    A fresh vector store is created, every file already tracked for the
    thread is added to it (best effort), and both the thread row and the
    file rows are pointed at the new store before committing.

    Args:
        session: Database session used for the lookup and the updates.
        thread: Thread whose stored vector store ID is stale.
        thread_uuid: UUID string of the thread (used in the store name).

    Returns:
        Tuple of (new_vector_store_id, vector_store_name).

    Raises:
        FileUploadError: If the replacement vector store cannot be created.
    """
    stale_store_id = thread.vector_store_id

    # Everything already tracked for this thread has to move to the new store.
    tracked_uploads = await file_upload_repo.find_by_thread(session, thread.id)
    ids_to_migrate = [upload.openai_file_id for upload in tracked_uploads]

    logger.info(
        "Recreating vector store for thread %s with %d existing files",
        thread_uuid,
        len(ids_to_migrate),
    )

    replacement = await self._create_vector_store_with_retry(thread_uuid)
    replacement_id = replacement.id
    replacement_name = replacement.name or f"Thread-{thread_uuid}"

    # Best effort: one file failing to attach must not abort the migration.
    migrated = 0
    for openai_file_id in ids_to_migrate:
        try:
            await self.client.vector_stores.files.create(
                vector_store_id=replacement_id,
                file_id=openai_file_id,
            )
        except Exception as exc:
            logger.warning(
                "Failed to add file %s to new vector store: %s",
                openai_file_id,
                exc,
            )
        else:
            migrated += 1

    # Persist the new store ID on the thread ...
    thread.vector_store_id = replacement_id
    session.add(thread)

    # ... and on every upload row, including ones whose re-attach failed.
    for upload in tracked_uploads:
        upload.vector_store_id = replacement_id
        upload.vector_store_name = replacement_name
        session.add(upload)

    await session.commit()

    logger.info(
        "Recreated vector store: %s -> %s (%d/%d files migrated)",
        stale_store_id,
        replacement_id,
        migrated,
        len(ids_to_migrate),
    )

    return replacement_id, replacement_name
138
+
139
async def _add_files_to_vector_store(
    self,
    vector_store_id: str,
    vector_store_name: str,
    file_ids: list[str],
    thread_id: int,
    user_id: int,
    filenames: list[str],
    file_sizes: list[int],
) -> None:
    """Add uploaded files to a vector store and track them in the database.

    Args:
        vector_store_id: The vector store to add files to.
        vector_store_name: The name of the vector store.
        file_ids: List of OpenAI file IDs to add. Nothing happens when empty.
        thread_id: Database ID of the thread.
        user_id: ID of the user who uploaded the files.
        filenames: Original filenames, parallel to ``file_ids``.
        file_sizes: Sizes in bytes, parallel to ``file_ids``.

    Raises:
        ValueError: If ``file_ids``, ``filenames`` and ``file_sizes`` differ
            in length.
        FileUploadError: If adding a file to the vector store fails.
    """
    if not file_ids:
        return

    # Fail fast: the strict zip below would reject mismatched lists anyway,
    # but only after the files were already attached to the vector store,
    # leaving attachments with no database record to clean them up later.
    if not (len(file_ids) == len(filenames) == len(file_sizes)):
        raise ValueError(
            "file_ids, filenames and file_sizes must have the same length"
        )

    # Add files to vector store
    for file_id in file_ids:
        try:
            await self.client.vector_stores.files.create(
                vector_store_id=vector_store_id,
                file_id=file_id,
            )
            logger.debug(
                "Added file %s to vector store %s",
                file_id,
                vector_store_id,
            )
        except Exception as e:
            logger.error(
                "Failed to add file %s to vector store: %s",
                file_id,
                e,
            )
            raise FileUploadError(f"Failed to add file to vector store: {e}") from e

    # Track in database (one row per file, written in a single commit)
    async with get_asyncdb_session() as session:
        for file_id, filename, size in zip(
            file_ids, filenames, file_sizes, strict=True
        ):
            upload_record = AssistantFileUpload(
                filename=filename,
                openai_file_id=file_id,
                vector_store_id=vector_store_id,
                vector_store_name=vector_store_name,
                thread_id=thread_id,
                user_id=user_id,
                file_size=size,
            )
            session.add(upload_record)

        await session.commit()
        logger.debug(
            "Tracked %d file uploads in database",
            len(file_ids),
        )
207
+
208
async def _validate_file_count(self, thread_id: int) -> None:
    """Ensure the thread has not yet reached its tracked-file limit.

    Args:
        thread_id: Database ID of the thread to check.

    Raises:
        FileUploadError: If the number of upload records for the thread is
            already at or above ``config.max_files_per_thread``.
    """
    # Local import keeps this block self-contained; sqlalchemy is already a
    # dependency of this module (``select``).
    from sqlalchemy import func

    # Count in the database instead of loading every row just to take len().
    count_query = (
        select(func.count())
        .select_from(AssistantFileUpload)
        .where(AssistantFileUpload.thread_id == thread_id)
    )
    async with get_asyncdb_session() as session:
        result = await session.execute(count_query)
        existing_count = result.scalar_one()

    if existing_count >= self.config.max_files_per_thread:
        raise FileUploadError(
            f"Maximum files per thread ({self.config.max_files_per_thread}) "
            "reached"
        )
223
+
224
async def _upload_with_retry(self, path: Path, max_retries: int = 2) -> str:
    """Upload a file to OpenAI, retrying on failure.

    The file is read at most once per successful read and the read runs in a
    worker thread, so large files neither block the event loop nor get
    re-read on every retry. Read errors are retried like upload errors.

    Args:
        path: Path to the file.
        max_retries: Maximum number of attempts.

    Returns:
        The OpenAI file ID.

    Raises:
        FileUploadError: If all attempts fail (chained to the last error).
    """
    last_error: Exception | None = None
    file_content: bytes | None = None

    for attempt in range(max_retries):
        try:
            if file_content is None:
                # Blocking disk I/O: keep it off the event loop. It stays
                # inside the try so a failed read still counts as an attempt.
                file_content = await asyncio.to_thread(path.read_bytes)
            uploaded = await self.client.files.create(
                file=(path.name, file_content),
                purpose="assistants",
            )
            return uploaded.id
        except Exception as e:
            last_error = e
            logger.warning(
                "File upload attempt %d failed: %s",
                attempt + 1,
                e,
            )
            # Pause one second between attempts, but not after the last one.
            if attempt < max_retries - 1:
                await asyncio.sleep(1)

    msg = f"Failed to upload file after {max_retries} attempts"
    raise FileUploadError(msg) from last_error
259
+
260
async def _wait_for_processing(  # noqa: PLR0912
    self,
    vector_store_id: str,
    file_ids: list[str],
    filenames: list[str],
    max_wait_seconds: int = 60,
) -> AsyncGenerator[Chunk, None]:
    """Poll the vector store until files are indexed, yielding progress chunks.

    The store's file list is polled once per second. A chunk is yielded for
    the start of indexing, for every file that completes or fails, and for a
    timeout. The closing "completed" chunk is only yielded when every file
    finished without failure.

    Args:
        vector_store_id: The vector store containing the files.
        file_ids: List of file IDs to wait for. Nothing is yielded if empty.
        filenames: Original filenames, parallel to ``file_ids``.
        max_wait_seconds: Maximum seconds to keep polling.

    Yields:
        Chunk objects with processing status updates.
    """
    if not file_ids:
        return

    # Map file IDs to filenames for display
    file_id_to_name = dict(zip(file_ids, filenames, strict=True))
    total_files = len(file_ids)
    completed_count = 0

    # Initial processing chunk
    if total_files == 1:
        initial_text = f"Indiziere: {filenames[0]}"
    else:
        initial_text = f"Indiziere {total_files} Dateien..."
    yield self._chunk_factory.create(
        ChunkType.PROCESSING,
        initial_text,
        {
            "status": "indexing",
            "total_files": total_files,
            "completed_files": 0,
        },
    )

    # This runs inside a coroutine, so the running loop is the right clock
    # source; the deadline is fixed once instead of re-deriving elapsed time.
    loop = asyncio.get_running_loop()
    deadline = loop.time() + max_wait_seconds
    pending_files = set(file_ids)
    success = True

    while pending_files and loop.time() < deadline:
        # NOTE(review): only the entries in ``.data`` of this one response
        # are inspected; if the store can hold more files than one response
        # returns, later entries are never seen here - confirm against the
        # SDK's pagination behaviour.
        vs_files = await self.client.vector_stores.files.list(
            vector_store_id=vector_store_id
        )

        for vs_file in vs_files.data:
            if vs_file.id in pending_files:
                if vs_file.status == "completed":
                    pending_files.discard(vs_file.id)
                    completed_count += 1
                    filename = file_id_to_name.get(vs_file.id, vs_file.id)
                    logger.debug("File indexed: %s", vs_file.id)

                    # Progress update chunk
                    progress_text = f"Indiziert: {filename}"
                    yield self._chunk_factory.create(
                        ChunkType.PROCESSING,
                        progress_text,
                        {
                            "status": "progress",
                            "total_files": total_files,
                            "completed_files": completed_count,
                            "current_file": filename,
                        },
                    )
                elif vs_file.status in ("failed", "cancelled"):
                    error_msg = ""
                    if vs_file.last_error:
                        error_msg = vs_file.last_error.message
                    logger.error(
                        "File indexing failed: %s - %s",
                        vs_file.id,
                        error_msg,
                    )
                    failed_name = file_id_to_name.get(vs_file.id, vs_file.id)
                    yield self._chunk_factory.create(
                        ChunkType.PROCESSING,
                        f"Fehler: {failed_name}",
                        {
                            "status": "failed",
                            "total_files": total_files,
                            "completed_files": completed_count,
                            "error": error_msg,
                        },
                    )
                    # A failed file stops being waited for, but it suppresses
                    # the final "completed" chunk below.
                    pending_files.discard(vs_file.id)
                    success = False

        if pending_files:
            await asyncio.sleep(1)

    if pending_files:
        logger.warning("Timeout waiting for files: %s", pending_files)
        yield self._chunk_factory.create(
            ChunkType.PROCESSING,
            f"Zeitüberschreitung ({completed_count}/{total_files})",
            {
                "status": "timeout",
                "total_files": total_files,
                "completed_files": completed_count,
            },
        )
        return

    # Final success chunk
    if success:
        if total_files == 1:
            done_text = f"Bereit: {filenames[0]}"
        else:
            done_text = f"{total_files} Dateien bereit"
        yield self._chunk_factory.create(
            ChunkType.PROCESSING,
            done_text,
            {
                "status": "completed",
                "total_files": total_files,
                "completed_files": total_files,
            },
        )
385
+
386
async def _create_vector_store_with_retry(
    self,
    thread_uuid: str,
    max_retries: int = 2,
) -> Any:
    """Create the thread's vector store, retrying on failure.

    The store is named ``Thread-<thread_uuid>`` and expires a configured
    number of days after it was last active.

    Args:
        thread_uuid: Thread UUID used in the store name.
        max_retries: Maximum number of attempts.

    Returns:
        The vector store object returned by the OpenAI client.

    Raises:
        FileUploadError: If every attempt fails (chained to the last error).
    """
    store_name = f"Thread-{thread_uuid}"
    failure: Exception | None = None

    for attempt_no in range(1, max_retries + 1):
        try:
            return await self.client.vector_stores.create(
                name=store_name,
                expires_after={
                    "anchor": "last_active_at",
                    "days": self.config.vector_store_expiration_days,
                },
            )
        except Exception as exc:
            failure = exc
            logger.warning(
                "Vector store creation attempt %d failed: %s",
                attempt_no,
                exc,
            )
            # Wait a second before the next try; skip the wait after the last.
            if attempt_no < max_retries:
                await asyncio.sleep(1)

    raise FileUploadError(
        f"Failed to create vector store after {max_retries} attempts"
    ) from failure
427
+
428
async def _delete_files_from_vector_stores(
    self, db_files: list[AssistantFileUpload]
) -> None:
    """Detach files from the vector stores they are recorded in (Level 1).

    Records without a vector store ID are skipped. Each removal is best
    effort: a failure is logged as a warning and the loop continues.

    Args:
        db_files: Upload records naming the file and its vector store.
    """
    # Group file IDs by the vector store they are attached to.
    files_by_store: dict[str, list[str]] = {}
    for record in db_files:
        if not record.vector_store_id:
            continue
        files_by_store.setdefault(record.vector_store_id, []).append(
            record.openai_file_id
        )

    for store_id, member_ids in files_by_store.items():
        for member_id in member_ids:
            try:
                await self.client.vector_stores.files.delete(
                    vector_store_id=store_id,
                    file_id=member_id,
                )
                logger.debug(
                    "Deleted file %s from vector store %s", member_id, store_id
                )
            except Exception as exc:
                logger.warning(
                    "Failed to delete file %s from vector store %s: %s",
                    member_id,
                    store_id,
                    exc,
                )
458
+
459
async def _delete_files_from_openai(self, file_ids: list[str]) -> dict[str, bool]:
    """Delete the file objects themselves from OpenAI (Level 2).

    Args:
        file_ids: OpenAI file IDs to delete.

    Returns:
        Mapping of each file ID to True if its deletion call succeeded and
        False if it raised (the error is logged as a warning).
    """
    outcome: dict[str, bool] = {}
    for openai_file_id in file_ids:
        try:
            await self.client.files.delete(file_id=openai_file_id)
            logger.debug("Deleted OpenAI file: %s", openai_file_id)
        except Exception as exc:
            logger.warning("Failed to delete OpenAI file %s: %s", openai_file_id, exc)
            outcome[openai_file_id] = False
        else:
            outcome[openai_file_id] = True
    return outcome
471
+
472
async def _delete_file_db_records(
    self,
    db_files: list[AssistantFileUpload],
    deletion_results: dict[str, bool],
) -> None:
    """Remove database rows for files whose OpenAI deletion succeeded (Level 3).

    Rows for files that could not be deleted remotely are kept. No session
    is opened when nothing was deleted.

    Args:
        db_files: Upload records that were candidates for deletion.
        deletion_results: Mapping of OpenAI file ID to deletion success.
    """
    confirmed_ids = [
        openai_file_id
        for openai_file_id, was_deleted in deletion_results.items()
        if was_deleted
    ]
    if not confirmed_ids:
        return

    async with get_asyncdb_session() as session:
        for record in db_files:
            if record.openai_file_id not in confirmed_ids:
                continue
            try:
                await session.delete(record)
                logger.debug(
                    "Deleted DB record for file: %s", record.openai_file_id
                )
            except Exception as exc:
                logger.warning(
                    "Failed to delete DB record for file %s: %s",
                    record.openai_file_id,
                    exc,
                )
        # One commit for the whole batch of deletions.
        await session.commit()
497
+
498
async def upload_file(
    self,
    file_path: str,
    thread_id: int,
    user_id: int,  # noqa: ARG002
) -> str:
    """Validate a local file and upload it to OpenAI.

    Checks, in order: the path is an existing regular file, its size is
    within the configured limit, and the thread has not reached its file
    limit. Only then is the file uploaded (with retry).

    Args:
        file_path: Local path to the file to upload.
        thread_id: Database ID of the thread this file belongs to.
        user_id: ID of the user uploading the file (currently unused here).

    Returns:
        The OpenAI file ID.

    Raises:
        FileUploadError: If validation fails or the upload errors out.
    """
    path = Path(file_path)

    # Require a regular file. A bare exists() check lets directories through,
    # which would only fail later inside the upload retry loop with a
    # misleading "failed after N attempts" error.
    if not path.is_file():
        raise FileUploadError(f"Datei nicht gefunden: {file_path}")

    # Validate file size
    file_size = path.stat().st_size
    if file_size > self._max_file_size_bytes:
        raise FileUploadError(
            f"Datei überschreitet die maximale Größe von {self.config.max_file_size_mb}MB"
        )

    # Validate file count for thread
    await self._validate_file_count(thread_id)

    # Upload to OpenAI with retry
    openai_file_id = await self._upload_with_retry(path)

    logger.info(
        "Uploaded file to OpenAI: %s -> %s",
        path.name,
        openai_file_id,
    )

    return openai_file_id
543
+
544
async def process_files(  # noqa: PLR0912
    self,
    file_paths: list[str],
    thread_db_id: int,
    thread_uuid: str,
    user_id: int,
) -> AsyncGenerator[Chunk, None]:
    """Upload, attach, track and index files for a thread, streaming progress.

    Runs four phases and yields a progress chunk at each step:
    upload every file to OpenAI, get (or create) the thread's vector store,
    add each file to the store and record it in the database, then wait for
    indexing. Chunks whose status is "completed", "timeout" or "failed" get
    the ``vector_store_id`` added to their metadata.

    Args:
        file_paths: Local file paths to process. Nothing is yielded if empty.
        thread_db_id: Database ID of the thread.
        thread_uuid: UUID string of the thread.
        user_id: ID of the user.

    Yields:
        Chunk objects with real-time progress updates.

    Raises:
        Exception: Any error from a phase is re-raised after the files
            uploaded so far have been deleted again.
    """
    if not file_paths:
        return

    openai_ids: list[str] = []
    names: list[str] = []
    sizes: list[int] = []
    batch_size = len(file_paths)
    vector_store_id: str | None = None
    terminal_states = ("completed", "timeout", "failed")

    try:
        # Phase 1: upload each file to OpenAI.
        # NOTE(review): upload_file validates the thread's file count from
        # database rows, but rows for this batch are only written in phase 3,
        # so a single batch looks able to exceed the limit - confirm.
        for position, raw_path in enumerate(file_paths, 1):
            local_path = Path(raw_path)
            display_name = local_path.name

            yield self._chunk_factory.create(
                ChunkType.PROCESSING,
                f"Lade hoch: {display_name} ({position}/{batch_size})",
                {
                    "status": "uploading",
                    "current_file": display_name,
                    "file_index": position,
                    "total_files": batch_size,
                },
            )

            new_file_id = await self.upload_file(raw_path, thread_db_id, user_id)
            openai_ids.append(new_file_id)
            names.append(display_name)
            sizes.append(local_path.stat().st_size)

        # Phase 2: resolve the vector store for this thread.
        yield self._chunk_factory.create(
            ChunkType.PROCESSING,
            "Bereite Vector Store vor...",
            {"status": "preparing_store"},
        )

        vector_store_id, vector_store_name = await self.get_vector_store(
            thread_db_id, thread_uuid
        )

        # Phase 3: attach every uploaded file to the store.
        attach_pairs = zip(openai_ids, names, strict=True)
        for position, (attached_id, display_name) in enumerate(attach_pairs, 1):
            yield self._chunk_factory.create(
                ChunkType.PROCESSING,
                f"Füge hinzu: {display_name} ({position}/{batch_size})",
                {
                    "status": "adding_to_store",
                    "current_file": display_name,
                    "file_index": position,
                    "total_files": batch_size,
                },
            )

            await self.client.vector_stores.files.create(
                vector_store_id=vector_store_id,
                file_id=attached_id,
            )
            logger.debug(
                "Added file %s to vector store %s", attached_id, vector_store_id
            )

        # Record the uploads so they can be found and cleaned up later.
        async with get_asyncdb_session() as session:
            for tracked_id, display_name, byte_count in zip(
                openai_ids, names, sizes, strict=True
            ):
                session.add(
                    AssistantFileUpload(
                        filename=display_name,
                        openai_file_id=tracked_id,
                        vector_store_id=vector_store_id,
                        vector_store_name=vector_store_name,
                        thread_id=thread_db_id,
                        user_id=user_id,
                        file_size=byte_count,
                    )
                )
            await session.commit()
            logger.debug("Tracked %d file uploads in database", len(names))

        # Phase 4: relay indexing progress, tagging terminal chunks with the
        # store ID so the consumer can pick it up.
        async for chunk in self._wait_for_processing(
            vector_store_id, openai_ids, names
        ):
            metadata = chunk.chunk_metadata
            if metadata and metadata.get("status") in terminal_states:
                chunk.chunk_metadata["vector_store_id"] = vector_store_id
            yield chunk

    except Exception:
        # Roll back: remove whatever was uploaded before the failure.
        if openai_ids:
            logger.warning(
                "Cleaning up %d uploaded files due to error",
                len(openai_ids),
            )
            await self.delete_files(openai_ids)
        raise
676
+
677
async def get_vector_store(
    self,
    thread_id: int,
    thread_uuid: str,
) -> tuple[str, str]:
    """Get the thread's vector store, creating or recreating it as needed.

    If the thread already references a vector store, its existence is
    checked with OpenAI. A store reported as missing is recreated and the
    thread's files are migrated. On any other lookup error the stored ID is
    returned unchanged (best effort). If the thread has no store, one is
    created and saved on the thread.

    The returned name always falls back to ``Thread-<thread_uuid>`` when
    OpenAI reports no name, so every code path yields the same default.

    Args:
        thread_id: Database ID of the thread.
        thread_uuid: UUID string of the thread (for naming).

    Returns:
        Tuple of (vector_store_id, vector_store_name).

    Raises:
        FileUploadError: If the thread does not exist or vector store
            creation fails.
    """
    default_name = f"Thread-{thread_uuid}"

    async with get_asyncdb_session() as session:
        result = await session.execute(
            select(AssistantThread).where(AssistantThread.id == thread_id)
        )
        thread = result.scalar_one_or_none()

        if not thread:
            raise FileUploadError(f"Thread not found: {thread_id}")

        # Return existing vector store if present and valid
        if thread.vector_store_id:
            logger.debug(
                "Checking existing vector store: %s",
                thread.vector_store_id,
            )
            # Validate vector store exists in OpenAI
            try:
                vs = await self.client.vector_stores.retrieve(
                    thread.vector_store_id
                )
                # Same fallback name as the creation/recreation paths; an
                # empty string here would end up stored on upload records.
                return thread.vector_store_id, vs.name or default_name
            except Exception as e:
                # NOTE(review): "missing" is detected by substring match on
                # the error text; any error mentioning "404" or "not found"
                # triggers a recreation - confirm this is acceptable.
                error_msg = str(e).lower()
                if "not found" in error_msg or "404" in error_msg:
                    logger.warning(
                        "Vector store %s no longer exists, creating new one",
                        thread.vector_store_id,
                    )
                    # Vector store doesn't exist - create new one and migrate files
                    return await self._recreate_vector_store(
                        session, thread, thread_uuid
                    )
                # Other error - log but try to continue with the stored ID
                logger.warning(
                    "Error checking vector store %s: %s",
                    thread.vector_store_id,
                    e,
                )
                return thread.vector_store_id, default_name

        # Create new vector store with expiration
        vector_store = await self._create_vector_store_with_retry(thread_uuid)
        vector_store_id = vector_store.id
        vector_store_name = vector_store.name or default_name

        # Update thread with vector store ID
        thread.vector_store_id = vector_store_id
        session.add(thread)
        await session.commit()

        logger.info(
            "Created vector store for thread %s: %s",
            thread_uuid,
            vector_store_id,
        )

        return vector_store_id, vector_store_name
754
+
755
async def delete_files(self, file_ids: list[str]) -> dict[str, bool]:
    """Delete files in three ordered levels.

    1. Detach the files from the vector stores recorded for them.
    2. Delete the file objects from OpenAI.
    3. Delete the database rows of files whose level-2 deletion succeeded.

    Args:
        file_ids: OpenAI file IDs to delete.

    Returns:
        Mapping of file ID to whether its OpenAI deletion succeeded; empty
        when no IDs were given.
    """
    if not file_ids:
        return {}

    # The database rows tell us which vector store each file belongs to.
    lookup = select(AssistantFileUpload).where(
        AssistantFileUpload.openai_file_id.in_(file_ids)
    )
    async with get_asyncdb_session() as session:
        rows = await session.execute(lookup)
        tracked_records = rows.scalars().all()

    # Level 1: vector store associations.
    await self._delete_files_from_vector_stores(tracked_records)

    # Level 2: the files themselves (all requested IDs, tracked or not).
    outcome = await self._delete_files_from_openai(file_ids)

    # Level 3: database rows, only where level 2 succeeded.
    await self._delete_file_db_records(tracked_records, outcome)

    return outcome
791
+
792
async def delete_vector_store(self, vector_store_id: str) -> bool:
    """Delete a vector store after deleting the files listed in it.

    Step 1 lists the store's files and removes them via ``delete_files``;
    a failure in this step is logged and does not prevent step 2.
    Step 2 deletes the vector store container itself.

    Args:
        vector_store_id: The vector store ID to delete.

    Returns:
        True if the vector store container was deleted, False if the ID was
        empty or the deletion call failed.
    """
    if not vector_store_id:
        return False

    logger.info("Deleting vector store: %s", vector_store_id)

    # Step 1: empty the store (best effort).
    try:
        listing = await self.client.vector_stores.files.list(
            vector_store_id=vector_store_id
        )
        member_ids = [entry.id for entry in listing.data]

        if member_ids:
            logger.info(
                "Deleting %d files from vector store %s",
                len(member_ids),
                vector_store_id,
            )
            outcome = await self.delete_files(member_ids)
            deleted_total = sum(1 for was_deleted in outcome.values() if was_deleted)
            logger.info(
                "Successfully deleted %d/%d files from vector store %s",
                deleted_total,
                len(member_ids),
                vector_store_id,
            )
    except Exception as exc:
        logger.warning(
            "Failed to delete files from vector store %s: %s",
            vector_store_id,
            exc,
        )

    # Step 2: remove the container; on failure the log notes it will expire.
    try:
        await self.client.vector_stores.delete(vector_store_id=vector_store_id)
    except Exception as exc:
        logger.warning(
            "Failed to delete vector store %s (will auto-expire): %s",
            vector_store_id,
            exc,
        )
        return False

    logger.info("Deleted vector store: %s", vector_store_id)
    return True
850
+
851
async def cleanup_deleted_thread(
    self,
    thread_db_id: int,
    vector_store_id: str | None,
) -> dict[str, Any]:
    """Release the OpenAI resources of a thread that has been deleted.

    All work is delegated to ``delete_vector_store``, which removes the
    store's files and then the store itself. Without a vector store ID
    there is nothing to do.

    Args:
        thread_db_id: Database ID of the deleted thread.
        vector_store_id: The vector store ID associated with the thread,
            if any.

    Returns:
        Dictionary with cleanup statistics:
            {
                'vector_store_deleted': bool,
                'thread_db_id': int,
                'errors': list[str]
            }
    """
    logger.info("Starting cleanup for deleted thread %d", thread_db_id)

    summary: dict[str, Any] = {
        "vector_store_deleted": False,
        "thread_db_id": thread_db_id,
        "errors": [],
    }

    if not vector_store_id:
        logger.debug(
            "No vector store to clean up for thread %d",
            thread_db_id,
        )
        return summary

    # File deletion happens inside delete_vector_store.
    store_removed = await self.delete_vector_store(vector_store_id)
    summary["vector_store_deleted"] = store_removed
    if not store_removed:
        summary["errors"].append(f"Failed to delete vector store {vector_store_id}")

    logger.info(
        "Cleanup completed for thread %d: VS=%s, errors=%d",
        thread_db_id,
        summary["vector_store_deleted"],
        len(summary["errors"]),
    )

    return summary