admin_api_lib-3.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- admin_api_lib/__init__.py +0 -0
- admin_api_lib/api_endpoints/document_deleter.py +24 -0
- admin_api_lib/api_endpoints/document_reference_retriever.py +25 -0
- admin_api_lib/api_endpoints/documents_status_retriever.py +20 -0
- admin_api_lib/api_endpoints/file_uploader.py +31 -0
- admin_api_lib/api_endpoints/source_uploader.py +40 -0
- admin_api_lib/api_endpoints/uploader_base.py +30 -0
- admin_api_lib/apis/__init__.py +0 -0
- admin_api_lib/apis/admin_api.py +197 -0
- admin_api_lib/apis/admin_api_base.py +120 -0
- admin_api_lib/chunker/__init__.py +0 -0
- admin_api_lib/chunker/chunker.py +25 -0
- admin_api_lib/dependency_container.py +236 -0
- admin_api_lib/extractor_api_client/__init__.py +0 -0
- admin_api_lib/extractor_api_client/openapi_client/__init__.py +38 -0
- admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +4 -0
- admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +516 -0
- admin_api_lib/extractor_api_client/openapi_client/api_client.py +695 -0
- admin_api_lib/extractor_api_client/openapi_client/api_response.py +20 -0
- admin_api_lib/extractor_api_client/openapi_client/configuration.py +460 -0
- admin_api_lib/extractor_api_client/openapi_client/exceptions.py +197 -0
- admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +21 -0
- admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +34 -0
- admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +103 -0
- admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +82 -0
- admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +104 -0
- admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +92 -0
- admin_api_lib/extractor_api_client/openapi_client/rest.py +209 -0
- admin_api_lib/extractor_api_client/openapi_client/test/__init__.py +0 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +35 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py +59 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +56 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +39 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +62 -0
- admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +54 -0
- admin_api_lib/file_services/file_service.py +77 -0
- admin_api_lib/impl/__init__.py +0 -0
- admin_api_lib/impl/admin_api.py +167 -0
- admin_api_lib/impl/api_endpoints/default_document_deleter.py +84 -0
- admin_api_lib/impl/api_endpoints/default_document_reference_retriever.py +72 -0
- admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py +41 -0
- admin_api_lib/impl/api_endpoints/default_file_uploader.py +234 -0
- admin_api_lib/impl/api_endpoints/default_source_uploader.py +202 -0
- admin_api_lib/impl/chunker/__init__.py +0 -0
- admin_api_lib/impl/chunker/chunker_type.py +11 -0
- admin_api_lib/impl/chunker/semantic_text_chunker.py +252 -0
- admin_api_lib/impl/chunker/text_chunker.py +33 -0
- admin_api_lib/impl/file_services/__init__.py +0 -0
- admin_api_lib/impl/file_services/s3_service.py +130 -0
- admin_api_lib/impl/information_enhancer/__init__.py +0 -0
- admin_api_lib/impl/information_enhancer/general_enhancer.py +52 -0
- admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +62 -0
- admin_api_lib/impl/information_enhancer/summary_enhancer.py +74 -0
- admin_api_lib/impl/key_db/__init__.py +0 -0
- admin_api_lib/impl/key_db/file_status_key_value_store.py +111 -0
- admin_api_lib/impl/mapper/informationpiece2document.py +108 -0
- admin_api_lib/impl/settings/__init__.py +0 -0
- admin_api_lib/impl/settings/chunker_class_type_settings.py +18 -0
- admin_api_lib/impl/settings/chunker_settings.py +29 -0
- admin_api_lib/impl/settings/document_extractor_settings.py +21 -0
- admin_api_lib/impl/settings/key_value_settings.py +26 -0
- admin_api_lib/impl/settings/rag_api_settings.py +21 -0
- admin_api_lib/impl/settings/s3_settings.py +31 -0
- admin_api_lib/impl/settings/source_uploader_settings.py +23 -0
- admin_api_lib/impl/settings/summarizer_settings.py +86 -0
- admin_api_lib/impl/summarizer/__init__.py +0 -0
- admin_api_lib/impl/summarizer/langchain_summarizer.py +117 -0
- admin_api_lib/information_enhancer/__init__.py +0 -0
- admin_api_lib/information_enhancer/information_enhancer.py +34 -0
- admin_api_lib/main.py +54 -0
- admin_api_lib/models/__init__.py +0 -0
- admin_api_lib/models/document_status.py +86 -0
- admin_api_lib/models/extra_models.py +9 -0
- admin_api_lib/models/http_validation_error.py +105 -0
- admin_api_lib/models/key_value_pair.py +85 -0
- admin_api_lib/models/status.py +44 -0
- admin_api_lib/models/validation_error.py +104 -0
- admin_api_lib/models/validation_error_loc_inner.py +114 -0
- admin_api_lib/prompt_templates/__init__.py +0 -0
- admin_api_lib/prompt_templates/summarize_prompt.py +14 -0
- admin_api_lib/rag_backend_client/__init__.py +0 -0
- admin_api_lib/rag_backend_client/openapi_client/__init__.py +60 -0
- admin_api_lib/rag_backend_client/openapi_client/api/__init__.py +4 -0
- admin_api_lib/rag_backend_client/openapi_client/api/rag_api.py +968 -0
- admin_api_lib/rag_backend_client/openapi_client/api_client.py +698 -0
- admin_api_lib/rag_backend_client/openapi_client/api_response.py +22 -0
- admin_api_lib/rag_backend_client/openapi_client/configuration.py +460 -0
- admin_api_lib/rag_backend_client/openapi_client/exceptions.py +197 -0
- admin_api_lib/rag_backend_client/openapi_client/models/__init__.py +41 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_history.py +99 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_history_message.py +83 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_request.py +93 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_response.py +103 -0
- admin_api_lib/rag_backend_client/openapi_client/models/chat_role.py +35 -0
- admin_api_lib/rag_backend_client/openapi_client/models/content_type.py +37 -0
- admin_api_lib/rag_backend_client/openapi_client/models/delete_request.py +99 -0
- admin_api_lib/rag_backend_client/openapi_client/models/information_piece.py +110 -0
- admin_api_lib/rag_backend_client/openapi_client/models/key_value_pair.py +83 -0
- admin_api_lib/rag_backend_client/openapi_client/rest.py +209 -0
- admin_api_lib/summarizer/__init__.py +0 -0
- admin_api_lib/summarizer/summarizer.py +33 -0
- admin_api_lib/utils/__init__.py +0 -0
- admin_api_lib/utils/utils.py +32 -0
- admin_api_lib-3.2.0.dist-info/METADATA +24 -0
- admin_api_lib-3.2.0.dist-info/RECORD +106 -0
- admin_api_lib-3.2.0.dist-info/WHEEL +4 -0
admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py
@@ -0,0 +1,41 @@
"""Module for the DefaultDocumentsStatusRetriever class."""

import logging

from admin_api_lib.api_endpoints.documents_status_retriever import (
    DocumentsStatusRetriever,
)
from admin_api_lib.impl.key_db.file_status_key_value_store import (
    FileStatusKeyValueStore,
)
from admin_api_lib.models.document_status import DocumentStatus

logger = logging.getLogger(__name__)


class DefaultDocumentsStatusRetriever(DocumentsStatusRetriever):
    """The DefaultDocumentsStatusRetriever retrieves the status of all documents from a key-value store."""

    def __init__(self, key_value_store: FileStatusKeyValueStore):
        """
        Initialize the DefaultDocumentsStatusRetriever.

        Parameters
        ----------
        key_value_store : FileStatusKeyValueStore
            The key-value store for storing filenames and their corresponding status.
        """
        self._key_value_store = key_value_store

    async def aget_all_documents_status(self) -> list[DocumentStatus]:
        """
        Asynchronously retrieves the status of all documents.

        Returns
        -------
        list[DocumentStatus]
            A list containing the status of all documents, where each document's
            status is represented by a DocumentStatus object.
        """
        all_documents = self._key_value_store.get_all()
        return [DocumentStatus(name=x[0], status=x[1]) for x in all_documents]
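A minimal usage sketch (not part of the package): the retriever only calls get_all() on the store and expects (name, status) pairs, so any duck-typed object can stand in for the real FileStatusKeyValueStore in a quick test. The InMemoryStore class and the example entry names below are hypothetical; only the imports from admin_api_lib are real.

import asyncio

from admin_api_lib.impl.api_endpoints.default_documents_status_retriever import (
    DefaultDocumentsStatusRetriever,
)
from admin_api_lib.models.status import Status


class InMemoryStore:
    """Hypothetical stand-in exposing only the get_all() method the retriever uses."""

    def get_all(self):
        # Returns (source_name, status) pairs, mirroring what the retriever consumes.
        return [("file:report.pdf", Status.READY), ("web:handbook", Status.PROCESSING)]


async def main():
    retriever = DefaultDocumentsStatusRetriever(key_value_store=InMemoryStore())
    for doc in await retriever.aget_all_documents_status():
        print(doc.name, doc.status)


asyncio.run(main())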
admin_api_lib/impl/api_endpoints/default_file_uploader.py
@@ -0,0 +1,234 @@
import logging
from pathlib import Path
import traceback
import urllib.parse
import tempfile
import asyncio
from contextlib import suppress

from fastapi import UploadFile, status, HTTPException
from langchain_core.documents import Document

from admin_api_lib.file_services.file_service import FileService
from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest
from admin_api_lib.api_endpoints.file_uploader import FileUploader
from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi
from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi
from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document
from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter
from admin_api_lib.chunker.chunker import Chunker
from admin_api_lib.models.status import Status
from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore
from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer
from admin_api_lib.utils.utils import sanitize_document_name

logger = logging.getLogger(__name__)


class DefaultFileUploader(FileUploader):
    """The DefaultFileUploader is responsible for adding a new source file document to the available content."""

    def __init__(
        self,
        extractor_api: ExtractorApi,
        key_value_store: FileStatusKeyValueStore,
        information_enhancer: InformationEnhancer,
        chunker: Chunker,
        document_deleter: DocumentDeleter,
        rag_api: RagApi,
        information_mapper: InformationPiece2Document,
        file_service: FileService,
    ):
        """
        Initialize the DefaultFileUploader.

        Parameters
        ----------
        extractor_api : ExtractorApi
            Client for the Extraction service.
        key_value_store : FileStatusKeyValueStore
            The key-value store for storing filenames and their corresponding status.
        information_enhancer : InformationEnhancer
            The service for enhancing information.
        chunker : Chunker
            The service for chunking documents into chunks.
        document_deleter : DocumentDeleter
            The service for deleting documents.
        rag_api : RagApi
            The API for the RAG backend.
        information_mapper : InformationPiece2Document
            The mapper for converting information pieces to langchain documents.
        file_service : FileService
            The service for handling file operations on the S3 storage.
        """
        super().__init__()
        self._extractor_api = extractor_api
        self._rag_api = rag_api
        self._key_value_store = key_value_store
        self._information_mapper = information_mapper
        self._information_enhancer = information_enhancer
        self._chunker = chunker
        self._document_deleter = document_deleter
        self._background_tasks = []
        self._file_service = file_service

    async def upload_file(
        self,
        base_url: str,
        file: UploadFile,
    ) -> None:
        """
        Uploads a source file for content extraction.

        Parameters
        ----------
        base_url : str
            The base URL of the service, used to determine the download link of the file.
        file : UploadFile
            The file to process.

        Returns
        -------
        None
        """
        self._prune_background_tasks()

        try:
            file.filename = sanitize_document_name(file.filename)
            source_name = f"file:{sanitize_document_name(file.filename)}"
            self._check_if_already_in_processing(source_name)
            self._key_value_store.upsert(source_name, Status.PROCESSING)
            content = await file.read()
            s3_path = await self._asave_new_document(content, file.filename, source_name)

            task = asyncio.create_task(self._handle_source_upload(s3_path, source_name, file.filename, base_url))
            task.add_done_callback(self._log_task_exception)
            self._background_tasks.append(task)
        except ValueError as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
        except Exception as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            logger.error("Error while uploading %s = %s", source_name, str(e))
            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

    def _log_task_exception(self, task: asyncio.Task) -> None:
        """
        Log exceptions from completed background tasks.

        Parameters
        ----------
        task : asyncio.Task
            The completed task to check for exceptions.
        """
        if task.done() and not task.cancelled():
            try:
                task.result()  # This will raise the exception if one occurred
            except Exception as e:
                logger.error("Background task failed with exception: %s", str(e))
                logger.debug("Background task exception traceback: %s", traceback.format_exc())

    def _prune_background_tasks(self) -> None:
        """Remove completed background tasks from the list."""
        self._background_tasks = [task for task in self._background_tasks if not task.done()]

    def _check_if_already_in_processing(self, source_name: str) -> None:
        """
        Checks if the source is already in the processing state.

        Parameters
        ----------
        source_name : str
            The name of the source.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If the source is already in the processing state.
        """
        existing = [s for name, s in self._key_value_store.get_all() if name == source_name]
        if any(s == Status.PROCESSING for s in existing):
            raise ValueError(f"Document {source_name} is already in processing state")

    async def _handle_source_upload(
        self,
        s3_path: Path,
        source_name: str,
        file_name: str,
        base_url: str,
    ):
        try:
            # Run blocking extractor API call in thread pool to avoid blocking event loop
            information_pieces = await asyncio.to_thread(
                self._extractor_api.extract_from_file_post,
                ExtractionRequest(path_on_s3=str(s3_path), document_name=source_name),
            )

            if not information_pieces:
                self._key_value_store.upsert(source_name, Status.ERROR)
                logger.error("No information pieces found in the document: %s", source_name)
                raise Exception("No information pieces found")
            documents: list[Document] = []
            for piece in information_pieces:
                documents.append(self._information_mapper.extractor_information_piece2document(piece))

            # Run blocking chunker call in thread pool to avoid blocking event loop
            chunked_documents = await asyncio.to_thread(self._chunker.chunk, documents)

            enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents)
            self._add_file_url(file_name, base_url, enhanced_documents)

            rag_information_pieces = [
                self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents
            ]
            # Replace old document
            # deletion is allowed to fail
            with suppress(Exception):
                await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False)

            # Run blocking RAG API call in thread pool to avoid blocking event loop
            await asyncio.to_thread(self._rag_api.upload_information_piece, rag_information_pieces)
            self._key_value_store.upsert(source_name, Status.READY)
            logger.info("Source uploaded successfully: %s", source_name)
        except Exception as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            logger.error("Error while uploading %s = %s", source_name, str(e))

    def _add_file_url(self, file_name: str, base_url: str, chunked_documents: list[Document]):
        document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}"
        for idx, chunk in enumerate(chunked_documents):
            if chunk.metadata["id"] in chunk.metadata["related"]:
                chunk.metadata["related"].remove(chunk.metadata["id"])
            chunk.metadata.update(
                {
                    "chunk": idx,
                    "chunk_length": len(chunk.page_content),
                    "document_url": document_url,
                }
            )

    async def _asave_new_document(
        self,
        file_content: bytes,
        filename: str,
        source_name: str,
    ) -> Path:
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_file_path = Path(temp_dir) / filename
                with open(temp_file_path, "wb") as temp_file:
                    logger.debug("Temporary file created at %s.", temp_file_path)
                    temp_file.write(file_content)
                    logger.debug("Temp file created and content written.")

                self._file_service.upload_file(Path(temp_file_path), filename)
                return Path(filename)
        except Exception as e:
            logger.error("Error during document saving: %s %s", e, traceback.format_exc())
            self._key_value_store.upsert(source_name, Status.ERROR)
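The task bookkeeping in upload_file (asyncio.create_task, a done-callback, pruning finished tasks on the next request) is a generic fire-and-forget pattern worth seeing in isolation. A self-contained sketch of that pattern; BackgroundJobRunner and flaky_job are illustrative names, not part of the package:

import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class BackgroundJobRunner:
    """Illustrative fire-and-forget runner mirroring DefaultFileUploader's task handling."""

    def __init__(self):
        self._background_tasks: list[asyncio.Task] = []

    def submit(self, coro) -> None:
        # Drop references to finished tasks so the list cannot grow unboundedly.
        self._background_tasks = [t for t in self._background_tasks if not t.done()]
        task = asyncio.create_task(coro)
        # Without a done-callback (or awaiting the task), a failure would only
        # surface as "Task exception was never retrieved" at garbage collection.
        task.add_done_callback(self._log_task_exception)
        self._background_tasks.append(task)

    def _log_task_exception(self, task: asyncio.Task) -> None:
        if task.done() and not task.cancelled():
            try:
                task.result()  # Re-raises the task's exception, if any
            except Exception as e:
                logger.error("Background task failed: %s", e)


async def flaky_job():
    await asyncio.sleep(0.1)
    raise RuntimeError("extraction failed")


async def main():
    runner = BackgroundJobRunner()
    runner.submit(flaky_job())
    await asyncio.sleep(0.2)  # give the task time to finish and log


asyncio.run(main())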
admin_api_lib/impl/api_endpoints/default_source_uploader.py
@@ -0,0 +1,202 @@
import logging
import asyncio
from threading import Thread
from contextlib import suppress

from pydantic import StrictStr
from fastapi import status, HTTPException
from langchain_core.documents import Document

from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi
from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters
from admin_api_lib.impl.settings.source_uploader_settings import SourceUploaderSettings
from admin_api_lib.models.key_value_pair import KeyValuePair
from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi
from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document
from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter
from admin_api_lib.api_endpoints.source_uploader import SourceUploader
from admin_api_lib.chunker.chunker import Chunker
from admin_api_lib.models.status import Status
from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore
from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer
from admin_api_lib.utils.utils import sanitize_document_name
from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import (
    InformationPiece as RagInformationPiece,
)

logger = logging.getLogger(__name__)


class DefaultSourceUploader(SourceUploader):

    def __init__(
        self,
        extractor_api: ExtractorApi,
        key_value_store: FileStatusKeyValueStore,
        information_enhancer: InformationEnhancer,
        chunker: Chunker,
        document_deleter: DocumentDeleter,
        rag_api: RagApi,
        information_mapper: InformationPiece2Document,
        settings: SourceUploaderSettings,
    ):
        """
        Initialize the DefaultSourceUploader.

        Parameters
        ----------
        extractor_api : ExtractorApi
            Client for the Extraction service.
        key_value_store : FileStatusKeyValueStore
            The key-value store for storing filenames and their corresponding status.
        information_enhancer : InformationEnhancer
            The service for enhancing information.
        chunker : Chunker
            The service for chunking documents into chunks.
        document_deleter : DocumentDeleter
            The service for deleting documents.
        rag_api : RagApi
            The API for the RAG backend.
        information_mapper : InformationPiece2Document
            The mapper for converting information pieces to langchain documents.
        settings : SourceUploaderSettings
            Settings for the source uploader, including the timeout for the upload operation.
        """
        super().__init__()
        self._extractor_api = extractor_api
        self._rag_api = rag_api
        self._key_value_store = key_value_store
        self._information_mapper = information_mapper
        self._information_enhancer = information_enhancer
        self._chunker = chunker
        self._document_deleter = document_deleter
        self._background_threads = []
        self._settings = settings

    async def upload_source(
        self,
        source_type: StrictStr,
        name: StrictStr,
        kwargs: list[KeyValuePair],
    ) -> None:
        """
        Uploads the parameters for source content extraction.

        The extraction runs on a background thread and is bounded by the timeout
        configured in SourceUploaderSettings (by default 3600.0 seconds, i.e. 1 hour).

        Parameters
        ----------
        source_type : str
            The type of the source, used by the extractor service to determine the correct extraction method.
        name : str
            Display name of the source.
        kwargs : list[KeyValuePair]
            List of KeyValuePair with parameters used for the extraction.

        Returns
        -------
        None
        """
        self._prune_background_threads()

        source_name = f"{source_type}:{sanitize_document_name(name)}"
        try:
            self._check_if_already_in_processing(source_name)
            self._key_value_store.upsert(source_name, Status.PROCESSING)

            thread = Thread(target=self._thread_worker, args=(source_name, source_type, kwargs, self._settings.timeout))
            thread.start()
            self._background_threads.append(thread)
        except ValueError as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
        except Exception as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            logger.error("Error while uploading %s = %s", source_name, str(e))
            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

    def _check_if_already_in_processing(self, source_name: str) -> None:
        """
        Checks if the source is already in the processing state.

        Parameters
        ----------
        source_name : str
            The name of the source.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If the source is already in the processing state.
        """
        existing = [s for name, s in self._key_value_store.get_all() if name == source_name]
        if any(s == Status.PROCESSING for s in existing):
            raise ValueError(f"Document {source_name} is already in processing state")

    def _thread_worker(self, source_name, source_type, kwargs, timeout):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(
                asyncio.wait_for(
                    self._handle_source_upload(source_name=source_name, source_type=source_type, kwargs=kwargs),
                    timeout=timeout,
                )
            )
        except asyncio.TimeoutError:
            logger.error("Upload of %s timed out after %s seconds", source_name, timeout)
            self._key_value_store.upsert(source_name, Status.ERROR)
        except Exception:
            logger.error("Error while uploading %s", source_name)
            self._key_value_store.upsert(source_name, Status.ERROR)
        finally:
            loop.close()

    async def _handle_source_upload(
        self,
        source_name: str,
        source_type: StrictStr,
        kwargs: list[KeyValuePair],
    ):
        try:
            # Run blocking extractor API call in thread pool to avoid blocking event loop
            information_pieces = await asyncio.to_thread(
                self._extractor_api.extract_from_source,
                ExtractionParameters(
                    source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]
                ),
            )

            if not information_pieces:
                self._key_value_store.upsert(source_name, Status.ERROR)
                logger.error("No information pieces found in the document: %s", source_name)
                raise Exception("No information pieces found")
            documents: list[Document] = []
            for piece in information_pieces:
                documents.append(self._information_mapper.extractor_information_piece2document(piece))

            # Run blocking chunker call in thread pool to avoid blocking event loop
            chunked_documents = await asyncio.to_thread(self._chunker.chunk, documents)

            # limit concurrency to avoid spawning multiple threads per call
            enhanced_documents = await self._information_enhancer.ainvoke(
                chunked_documents, config={"max_concurrency": 1}
            )

            rag_information_pieces: list[RagInformationPiece] = []
            for doc in enhanced_documents:
                rag_information_pieces.append(self._information_mapper.document2rag_information_piece(doc))

            # Replace old document; deletion is allowed to fail
            with suppress(Exception):
                await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False)

            # Run blocking RAG API call in thread pool to avoid blocking event loop
            await asyncio.to_thread(self._rag_api.upload_information_piece, rag_information_pieces)
            self._key_value_store.upsert(source_name, Status.READY)
            logger.info("Source uploaded successfully: %s", source_name)
        except Exception as e:
            self._key_value_store.upsert(source_name, Status.ERROR)
            logger.error("Error while uploading %s = %s", source_name, str(e))
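Unlike DefaultFileUploader, which schedules asyncio tasks on the server's event loop, DefaultSourceUploader isolates each upload on its own thread with a private event loop so the whole pipeline can be bounded by asyncio.wait_for without tying up the server. A reduced sketch of that thread-worker pattern; slow_job and the 0.1-second timeout are illustrative only:

import asyncio
import logging
from threading import Thread

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def slow_job():
    await asyncio.sleep(10)  # stands in for extraction + enhancement + upload


def thread_worker(timeout: float) -> None:
    # Each thread needs its own loop; the server's running loop belongs to the main thread.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(asyncio.wait_for(slow_job(), timeout=timeout))
    except asyncio.TimeoutError:
        logger.error("Job timed out after %s seconds", timeout)
    finally:
        loop.close()


thread = Thread(target=thread_worker, args=(0.1,))
thread.start()
thread.join()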