admin-api-lib 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. admin_api_lib/__init__.py +0 -0
  2. admin_api_lib/api_endpoints/document_deleter.py +24 -0
  3. admin_api_lib/api_endpoints/document_reference_retriever.py +25 -0
  4. admin_api_lib/api_endpoints/documents_status_retriever.py +20 -0
  5. admin_api_lib/api_endpoints/file_uploader.py +31 -0
  6. admin_api_lib/api_endpoints/source_uploader.py +40 -0
  7. admin_api_lib/api_endpoints/uploader_base.py +30 -0
  8. admin_api_lib/apis/__init__.py +0 -0
  9. admin_api_lib/apis/admin_api.py +197 -0
  10. admin_api_lib/apis/admin_api_base.py +120 -0
  11. admin_api_lib/chunker/__init__.py +0 -0
  12. admin_api_lib/chunker/chunker.py +25 -0
  13. admin_api_lib/dependency_container.py +236 -0
  14. admin_api_lib/extractor_api_client/__init__.py +0 -0
  15. admin_api_lib/extractor_api_client/openapi_client/__init__.py +38 -0
  16. admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +4 -0
  17. admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +516 -0
  18. admin_api_lib/extractor_api_client/openapi_client/api_client.py +695 -0
  19. admin_api_lib/extractor_api_client/openapi_client/api_response.py +20 -0
  20. admin_api_lib/extractor_api_client/openapi_client/configuration.py +460 -0
  21. admin_api_lib/extractor_api_client/openapi_client/exceptions.py +197 -0
  22. admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +21 -0
  23. admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +34 -0
  24. admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +103 -0
  25. admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +82 -0
  26. admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +104 -0
  27. admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +92 -0
  28. admin_api_lib/extractor_api_client/openapi_client/rest.py +209 -0
  29. admin_api_lib/extractor_api_client/openapi_client/test/__init__.py +0 -0
  30. admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +35 -0
  31. admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py +59 -0
  32. admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +56 -0
  33. admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +39 -0
  34. admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +62 -0
  35. admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +54 -0
  36. admin_api_lib/file_services/file_service.py +77 -0
  37. admin_api_lib/impl/__init__.py +0 -0
  38. admin_api_lib/impl/admin_api.py +167 -0
  39. admin_api_lib/impl/api_endpoints/default_document_deleter.py +84 -0
  40. admin_api_lib/impl/api_endpoints/default_document_reference_retriever.py +72 -0
  41. admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py +41 -0
  42. admin_api_lib/impl/api_endpoints/default_file_uploader.py +234 -0
  43. admin_api_lib/impl/api_endpoints/default_source_uploader.py +202 -0
  44. admin_api_lib/impl/chunker/__init__.py +0 -0
  45. admin_api_lib/impl/chunker/chunker_type.py +11 -0
  46. admin_api_lib/impl/chunker/semantic_text_chunker.py +252 -0
  47. admin_api_lib/impl/chunker/text_chunker.py +33 -0
  48. admin_api_lib/impl/file_services/__init__.py +0 -0
  49. admin_api_lib/impl/file_services/s3_service.py +130 -0
  50. admin_api_lib/impl/information_enhancer/__init__.py +0 -0
  51. admin_api_lib/impl/information_enhancer/general_enhancer.py +52 -0
  52. admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +62 -0
  53. admin_api_lib/impl/information_enhancer/summary_enhancer.py +74 -0
  54. admin_api_lib/impl/key_db/__init__.py +0 -0
  55. admin_api_lib/impl/key_db/file_status_key_value_store.py +111 -0
  56. admin_api_lib/impl/mapper/informationpiece2document.py +108 -0
  57. admin_api_lib/impl/settings/__init__.py +0 -0
  58. admin_api_lib/impl/settings/chunker_class_type_settings.py +18 -0
  59. admin_api_lib/impl/settings/chunker_settings.py +29 -0
  60. admin_api_lib/impl/settings/document_extractor_settings.py +21 -0
  61. admin_api_lib/impl/settings/key_value_settings.py +26 -0
  62. admin_api_lib/impl/settings/rag_api_settings.py +21 -0
  63. admin_api_lib/impl/settings/s3_settings.py +31 -0
  64. admin_api_lib/impl/settings/source_uploader_settings.py +23 -0
  65. admin_api_lib/impl/settings/summarizer_settings.py +86 -0
  66. admin_api_lib/impl/summarizer/__init__.py +0 -0
  67. admin_api_lib/impl/summarizer/langchain_summarizer.py +117 -0
  68. admin_api_lib/information_enhancer/__init__.py +0 -0
  69. admin_api_lib/information_enhancer/information_enhancer.py +34 -0
  70. admin_api_lib/main.py +54 -0
  71. admin_api_lib/models/__init__.py +0 -0
  72. admin_api_lib/models/document_status.py +86 -0
  73. admin_api_lib/models/extra_models.py +9 -0
  74. admin_api_lib/models/http_validation_error.py +105 -0
  75. admin_api_lib/models/key_value_pair.py +85 -0
  76. admin_api_lib/models/status.py +44 -0
  77. admin_api_lib/models/validation_error.py +104 -0
  78. admin_api_lib/models/validation_error_loc_inner.py +114 -0
  79. admin_api_lib/prompt_templates/__init__.py +0 -0
  80. admin_api_lib/prompt_templates/summarize_prompt.py +14 -0
  81. admin_api_lib/rag_backend_client/__init__.py +0 -0
  82. admin_api_lib/rag_backend_client/openapi_client/__init__.py +60 -0
  83. admin_api_lib/rag_backend_client/openapi_client/api/__init__.py +4 -0
  84. admin_api_lib/rag_backend_client/openapi_client/api/rag_api.py +968 -0
  85. admin_api_lib/rag_backend_client/openapi_client/api_client.py +698 -0
  86. admin_api_lib/rag_backend_client/openapi_client/api_response.py +22 -0
  87. admin_api_lib/rag_backend_client/openapi_client/configuration.py +460 -0
  88. admin_api_lib/rag_backend_client/openapi_client/exceptions.py +197 -0
  89. admin_api_lib/rag_backend_client/openapi_client/models/__init__.py +41 -0
  90. admin_api_lib/rag_backend_client/openapi_client/models/chat_history.py +99 -0
  91. admin_api_lib/rag_backend_client/openapi_client/models/chat_history_message.py +83 -0
  92. admin_api_lib/rag_backend_client/openapi_client/models/chat_request.py +93 -0
  93. admin_api_lib/rag_backend_client/openapi_client/models/chat_response.py +103 -0
  94. admin_api_lib/rag_backend_client/openapi_client/models/chat_role.py +35 -0
  95. admin_api_lib/rag_backend_client/openapi_client/models/content_type.py +37 -0
  96. admin_api_lib/rag_backend_client/openapi_client/models/delete_request.py +99 -0
  97. admin_api_lib/rag_backend_client/openapi_client/models/information_piece.py +110 -0
  98. admin_api_lib/rag_backend_client/openapi_client/models/key_value_pair.py +83 -0
  99. admin_api_lib/rag_backend_client/openapi_client/rest.py +209 -0
  100. admin_api_lib/summarizer/__init__.py +0 -0
  101. admin_api_lib/summarizer/summarizer.py +33 -0
  102. admin_api_lib/utils/__init__.py +0 -0
  103. admin_api_lib/utils/utils.py +32 -0
  104. admin_api_lib-3.2.0.dist-info/METADATA +24 -0
  105. admin_api_lib-3.2.0.dist-info/RECORD +106 -0
  106. admin_api_lib-3.2.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,41 @@
1
+ """Module for the DefaultDocumentsStatusRetriever class."""
2
+
3
+ import logging
4
+
5
+ from admin_api_lib.api_endpoints.documents_status_retriever import (
6
+ DocumentsStatusRetriever,
7
+ )
8
+ from admin_api_lib.impl.key_db.file_status_key_value_store import (
9
+ FileStatusKeyValueStore,
10
+ )
11
+ from admin_api_lib.models.document_status import DocumentStatus
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class DefaultDocumentsStatusRetriever(DocumentsStatusRetriever):
17
+ """The DefaultDocumentsStatusRetriever retrieves the status of all documents from a key-value store."""
18
+
19
+ def __init__(self, key_value_store: FileStatusKeyValueStore):
20
+ """
21
+ Initialize the DefaultDocumentsStatusRetriever.
22
+
23
+ Parameters
24
+ ----------
25
+ key_value_store : FileStatusKeyValueStore
26
+ The key-value store for storing filename and the corresponding status.
27
+ """
28
+ self._key_value_store = key_value_store
29
+
30
+ async def aget_all_documents_status(self) -> list[DocumentStatus]:
31
+ """
32
+ Asynchronously retrieves the status of all documents.
33
+
34
+ Returns
35
+ -------
36
+ list[DocumentStatus]
37
+ A list containing the status of all documents, where each document's
38
+ status is represented by a DocumentStatus object.
39
+ """
40
+ all_documents = self._key_value_store.get_all()
41
+ return [DocumentStatus(name=x[0], status=x[1]) for x in all_documents]
@@ -0,0 +1,234 @@
1
import asyncio
import logging
import tempfile
import traceback
import urllib
import urllib.parse
from contextlib import suppress
from pathlib import Path

from fastapi import HTTPException, UploadFile, status
from langchain_core.documents import Document

from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter
from admin_api_lib.api_endpoints.file_uploader import FileUploader
from admin_api_lib.chunker.chunker import Chunker
from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi
from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest
from admin_api_lib.file_services.file_service import FileService
from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore
from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document
from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer
from admin_api_lib.models.status import Status
from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi
from admin_api_lib.utils.utils import sanitize_document_name

25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class DefaultFileUploader(FileUploader):
    """The DefaultFileUploader is responsible for adding a new source file document to the available content."""

    def __init__(
        self,
        extractor_api: ExtractorApi,
        key_value_store: FileStatusKeyValueStore,
        information_enhancer: InformationEnhancer,
        chunker: Chunker,
        document_deleter: DocumentDeleter,
        rag_api: RagApi,
        information_mapper: InformationPiece2Document,
        file_service: FileService,
    ):
        """
        Initialize the DefaultFileUploader.

        Parameters
        ----------
        extractor_api : ExtractorApi
            Client for the Extraction service.
        key_value_store : FileStatusKeyValueStore
            The key-value store for storing filename and the corresponding status.
        information_enhancer : InformationEnhancer
            The service for enhancing information.
        chunker : Chunker
            The service for chunking documents into chunks.
        document_deleter : DocumentDeleter
            The service for deleting documents.
        rag_api : RagApi
            The API for RAG backend.
        information_mapper : InformationPiece2Document
            The mapper for converting information pieces to langchain documents.
        file_service : FileService
            The service for handling file operations on the S3 storage.
        """
        super().__init__()
        self._extractor_api = extractor_api
        self._rag_api = rag_api
        self._key_value_store = key_value_store
        self._information_mapper = information_mapper
        self._information_enhancer = information_enhancer
        self._chunker = chunker
        self._document_deleter = document_deleter
        self._background_tasks = []
        self._file_service = file_service

    async def upload_file(
        self,
        base_url: str,
        file: UploadFile,
    ) -> None:
        """
        Upload a source file for content extraction.

        The heavy pipeline (extraction, chunking, enhancement, RAG upload) runs
        as a fire-and-forget asyncio task; this coroutine only stores the file
        on S3 and schedules the task.

        Parameters
        ----------
        base_url : str
            The base url of the service. Is used to determine the download link of the file.
        file : UploadFile
            The file to process.

        Raises
        ------
        HTTPException
            400 if the document is already being processed, 500 on any other error.
        """
        self._prune_background_tasks()

        # Compute the source name BEFORE entering the try block: the except
        # handlers reference it, and the original code could raise
        # UnboundLocalError there if an early statement failed.
        file.filename = sanitize_document_name(file.filename)
        source_name = f"file:{file.filename}"
        try:
            self._check_if_already_in_processing(source_name)
            self._key_value_store.upsert(source_name, Status.PROCESSING)
            content = await file.read()
            s3_path = await self._asave_new_document(content, file.filename, source_name)

            task = asyncio.create_task(self._handle_source_upload(s3_path, source_name, file.filename, base_url))
            task.add_done_callback(self._log_task_exception)
            self._background_tasks.append(task)
        except ValueError as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) from e
        except Exception as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            logger.error("Error while uploading %s = %s", source_name, str(e))
            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) from e

    def _log_task_exception(self, task: asyncio.Task) -> None:
        """
        Log exceptions from completed background tasks.

        Parameters
        ----------
        task : asyncio.Task
            The completed task to check for exceptions.
        """
        if task.done() and not task.cancelled():
            try:
                task.result()  # Re-raises the task's exception, if any occurred.
            except Exception as e:
                logger.error("Background task failed with exception: %s", str(e))
                logger.debug("Background task exception traceback: %s", traceback.format_exc())

    def _prune_background_tasks(self) -> None:
        """Remove completed background tasks from the bookkeeping list."""
        self._background_tasks = [task for task in self._background_tasks if not task.done()]

    def _check_if_already_in_processing(self, source_name: str) -> None:
        """
        Check if the source is already in processing state.

        Parameters
        ----------
        source_name : str
            The name of the source.

        Raises
        ------
        ValueError
            If the source is already in processing state.
        """
        existing = [s for name, s in self._key_value_store.get_all() if name == source_name]
        if any(s == Status.PROCESSING for s in existing):
            raise ValueError(f"Document {source_name} is already in processing state")

    async def _handle_source_upload(
        self,
        s3_path: Path,
        source_name: str,
        file_name: str,
        base_url: str,
    ):
        """
        Run the full ingestion pipeline for an uploaded file.

        Extracts information pieces, chunks and enhances them, replaces any
        previously ingested version in the RAG backend, then marks the source
        READY. Runs as a background task, so failures are recorded in the
        key-value store and logged rather than propagated.
        """
        try:
            # Run blocking extractor API call in thread pool to avoid blocking event loop.
            information_pieces = await asyncio.to_thread(
                self._extractor_api.extract_from_file_post,
                ExtractionRequest(path_on_s3=str(s3_path), document_name=source_name),
            )

            if not information_pieces:
                self._key_value_store.upsert(source_name, Status.ERROR)
                logger.error("No information pieces found in the document: %s", source_name)
                raise Exception("No information pieces found")
            documents: list[Document] = [
                self._information_mapper.extractor_information_piece2document(piece)
                for piece in information_pieces
            ]

            # Run blocking chunker call in thread pool to avoid blocking event loop.
            chunked_documents = await asyncio.to_thread(self._chunker.chunk, documents)

            # NOTE(review): unlike DefaultSourceUploader, no max_concurrency cap
            # is applied here — confirm whether that asymmetry is intentional.
            enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents)
            self._add_file_url(file_name, base_url, enhanced_documents)

            rag_information_pieces = [
                self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents
            ]
            # Replace old document; deletion is allowed to fail (e.g. first upload).
            with suppress(Exception):
                await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False)

            # Run blocking RAG API call in thread pool to avoid blocking event loop.
            await asyncio.to_thread(self._rag_api.upload_information_piece, rag_information_pieces)
            self._key_value_store.upsert(source_name, Status.READY)
            logger.info("Source uploaded successfully: %s", source_name)
        except Exception as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            logger.error("Error while uploading %s = %s", source_name, str(e))

    def _add_file_url(self, file_name: str, base_url: str, chunked_documents: list[Document]):
        """Attach chunk index, chunk length and the document download URL to each chunk's metadata."""
        document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}"
        for idx, chunk in enumerate(chunked_documents):
            # A chunk must not list itself among its related chunks.
            if chunk.metadata["id"] in chunk.metadata["related"]:
                chunk.metadata["related"].remove(chunk.metadata["id"])
            chunk.metadata.update(
                {
                    "chunk": idx,
                    "chunk_length": len(chunk.page_content),
                    "document_url": document_url,
                }
            )

    async def _asave_new_document(
        self,
        file_content: bytes,
        filename: str,
        source_name: str,
    ) -> Path:
        """
        Persist the uploaded content to S3 via the file service.

        Parameters
        ----------
        file_content : bytes
            Raw content of the uploaded file.
        filename : str
            Sanitized filename; also used as the S3 object key.
        source_name : str
            Key under which the document's status is tracked.

        Returns
        -------
        Path
            The S3 path of the stored document (the filename as a Path, matching
            the declared return type; callers stringify it for the extractor).

        Raises
        ------
        Exception
            Re-raised after the status is set to ERROR. The original code
            swallowed the error and implicitly returned None, which let the
            caller schedule a doomed background task with s3_path=None.
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_file_path = Path(temp_dir) / filename
                temp_file_path.write_bytes(file_content)
                logger.debug("Temporary file created at %s and content written.", temp_file_path)
                self._file_service.upload_file(temp_file_path, filename)
            return Path(filename)
        except Exception as e:
            logger.error("Error during document saving: %s %s", e, traceback.format_exc())
            self._key_value_store.upsert(source_name, Status.ERROR)
            raise
@@ -0,0 +1,202 @@
1
+ import logging
2
+ import asyncio
3
+ from threading import Thread
4
+ from contextlib import suppress
5
+
6
+ from pydantic import StrictStr
7
+ from fastapi import status, HTTPException
8
+ from langchain_core.documents import Document
9
+
10
+ from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi
11
+ from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters
12
+ from admin_api_lib.impl.settings.source_uploader_settings import SourceUploaderSettings
13
+ from admin_api_lib.models.key_value_pair import KeyValuePair
14
+ from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi
15
+ from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document
16
+ from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter
17
+ from admin_api_lib.api_endpoints.source_uploader import SourceUploader
18
+ from admin_api_lib.chunker.chunker import Chunker
19
+ from admin_api_lib.models.status import Status
20
+ from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore
21
+ from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer
22
+ from admin_api_lib.utils.utils import sanitize_document_name
23
+ from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import (
24
+ InformationPiece as RagInformationPiece,
25
+ )
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class DefaultSourceUploader(SourceUploader):
    """Adds a new non-file source (identified by source type and name) to the available content.

    Extraction, chunking, enhancement and the RAG upload run on a dedicated
    background thread with its own event loop, bounded by the configured timeout.
    """

    def __init__(
        self,
        extractor_api: ExtractorApi,
        key_value_store: FileStatusKeyValueStore,
        information_enhancer: InformationEnhancer,
        chunker: Chunker,
        document_deleter: DocumentDeleter,
        rag_api: RagApi,
        information_mapper: InformationPiece2Document,
        settings: SourceUploaderSettings,
    ):
        """
        Initialize the DefaultSourceUploader.

        Parameters
        ----------
        extractor_api : ExtractorApi
            Client for the Extraction service.
        key_value_store : FileStatusKeyValueStore
            The key-value store for storing filename and the corresponding status.
        information_enhancer : InformationEnhancer
            The service for enhancing information.
        chunker : Chunker
            The service for chunking documents into chunks.
        document_deleter : DocumentDeleter
            The service for deleting documents.
        rag_api : RagApi
            The API for RAG backend.
        information_mapper : InformationPiece2Document
            The mapper for converting information pieces to langchain documents.
        settings : SourceUploaderSettings
            Settings for the uploader; provides the per-upload timeout.
        """
        super().__init__()
        self._extractor_api = extractor_api
        self._rag_api = rag_api
        self._key_value_store = key_value_store
        self._information_mapper = information_mapper
        self._information_enhancer = information_enhancer
        self._chunker = chunker
        self._document_deleter = document_deleter
        self._background_threads = []
        self._settings = settings

    async def upload_source(
        self,
        source_type: StrictStr,
        name: StrictStr,
        kwargs: list[KeyValuePair],
    ) -> None:
        """
        Upload the parameters for source content extraction.

        The actual pipeline runs on a background thread; this method only
        validates, marks the source as PROCESSING and starts the thread.
        The timeout is taken from the injected SourceUploaderSettings.

        Parameters
        ----------
        source_type : str
            The type of the source. Is used by the extractor service to determine the correct extraction method.
        name : str
            Display name of the source.
        kwargs : list[KeyValuePair]
            List of KeyValuePair with parameters used for the extraction.

        Raises
        ------
        HTTPException
            400 if the source is already being processed, 500 on any other error.
        """

        # NOTE(review): _prune_background_threads is not defined in this class —
        # presumably inherited from SourceUploader; confirm.
        self._prune_background_threads()

        source_name = f"{source_type}:{sanitize_document_name(name)}"
        try:
            self._check_if_already_in_processing(source_name)
            self._key_value_store.upsert(source_name, Status.PROCESSING)

            thread = Thread(target=self._thread_worker, args=(source_name, source_type, kwargs, self._settings.timeout))
            thread.start()
            self._background_threads.append(thread)
        except ValueError as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
        except Exception as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            logger.error("Error while uploading %s = %s", source_name, str(e))
            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

    def _check_if_already_in_processing(self, source_name: str) -> None:
        """
        Checks if the source is already in processing state.

        Parameters
        ----------
        source_name : str
            The name of the source.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If the source is already in processing state.
        """
        existing = [s for name, s in self._key_value_store.get_all() if name == source_name]
        if any(s == Status.PROCESSING for s in existing):
            raise ValueError(f"Document {source_name} is already in processing state")

    def _thread_worker(self, source_name, source_type, kwargs, timeout):
        """Run the async upload pipeline on this worker thread inside a fresh event loop, bounded by *timeout* seconds."""
        # Each worker thread needs its own event loop; the FastAPI loop lives on
        # the main thread and must not be reused here.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(
                asyncio.wait_for(
                    self._handle_source_upload(source_name=source_name, source_type=source_type, kwargs=kwargs),
                    timeout=timeout,
                )
            )
        except asyncio.TimeoutError:
            logger.error("Upload of %s timed out after %s seconds", source_name, timeout)
            self._key_value_store.upsert(source_name, Status.ERROR)
        except Exception:
            logger.error("Error while uploading %s", source_name)
            self._key_value_store.upsert(source_name, Status.ERROR)
        finally:
            loop.close()

    async def _handle_source_upload(
        self,
        source_name: str,
        source_type: StrictStr,
        kwargs: list[KeyValuePair],
    ):
        """Extract, chunk, enhance and upload the source's content, then mark it READY (or ERROR on failure)."""
        try:
            # Run blocking extractor API call in thread pool to avoid blocking event loop
            information_pieces = await asyncio.to_thread(
                self._extractor_api.extract_from_source,
                ExtractionParameters(
                    source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]
                ),
            )

            if not information_pieces:
                self._key_value_store.upsert(source_name, Status.ERROR)
                logger.error("No information pieces found in the document: %s", source_name)
                raise Exception("No information pieces found")
            documents: list[Document] = []
            for piece in information_pieces:
                documents.append(self._information_mapper.extractor_information_piece2document(piece))

            # Run blocking chunker call in thread pool to avoid blocking event loop
            chunked_documents = await asyncio.to_thread(self._chunker.chunk, documents)

            # limit concurrency to avoid spawning multiple threads per call
            enhanced_documents = await self._information_enhancer.ainvoke(
                chunked_documents, config={"max_concurrency": 1}
            )

            rag_information_pieces: list[RagInformationPiece] = []
            for doc in enhanced_documents:
                rag_information_pieces.append(self._information_mapper.document2rag_information_piece(doc))

            # Deleting any previous version is best-effort (e.g. first upload).
            with suppress(Exception):
                await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False)

            # Run blocking RAG API call in thread pool to avoid blocking event loop
            await asyncio.to_thread(self._rag_api.upload_information_piece, rag_information_pieces)
            self._key_value_store.upsert(source_name, Status.READY)
            logger.info("Source uploaded successfully: %s", source_name)
        except Exception as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            logger.error("Error while uploading %s = %s", source_name, str(e))
File without changes
@@ -0,0 +1,11 @@
1
+ """Module containing the ChunkerType enumeration."""
2
+
3
+ from enum import StrEnum, unique
4
+
5
+
6
@unique
class ChunkerType(StrEnum):
    """An enumeration representing different types of chunkers.

    The string values are used as configuration keys to select a chunker
    implementation.
    """

    # Presumably selects the semantic text chunker — confirm against dependency container.
    SEMANTIC = "semantic"
    # Presumably selects a recursive text splitter — confirm against dependency container.
    RECURSIVE = "recursive"