ws-bom-robot-app 0.0.73__py3-none-any.whl → 0.0.75__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ws_bom_robot_app/llm/agent_description.py +123 -123
- ws_bom_robot_app/llm/agent_handler.py +177 -177
- ws_bom_robot_app/llm/agent_lcel.py +50 -50
- ws_bom_robot_app/llm/defaut_prompt.py +15 -15
- ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -66
- ws_bom_robot_app/llm/main.py +138 -138
- ws_bom_robot_app/llm/models/feedback.py +30 -30
- ws_bom_robot_app/llm/nebuly_handler.py +181 -181
- ws_bom_robot_app/llm/settings.py +4 -4
- ws_bom_robot_app/llm/tools/tool_builder.py +65 -65
- ws_bom_robot_app/llm/tools/tool_manager.py +330 -330
- ws_bom_robot_app/llm/tools/utils.py +41 -41
- ws_bom_robot_app/llm/utils/agent.py +34 -34
- ws_bom_robot_app/llm/utils/cms.py +114 -114
- ws_bom_robot_app/llm/utils/download.py +79 -79
- ws_bom_robot_app/llm/utils/print.py +29 -29
- ws_bom_robot_app/llm/vector_store/db/base.py +47 -0
- ws_bom_robot_app/llm/vector_store/db/chroma.py +27 -8
- ws_bom_robot_app/llm/vector_store/db/faiss.py +34 -8
- ws_bom_robot_app/llm/vector_store/generator.py +137 -137
- ws_bom_robot_app/llm/vector_store/integration/thron.py +103 -123
- ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
- {ws_bom_robot_app-0.0.73.dist-info → ws_bom_robot_app-0.0.75.dist-info}/METADATA +4 -4
- {ws_bom_robot_app-0.0.73.dist-info → ws_bom_robot_app-0.0.75.dist-info}/RECORD +26 -26
- {ws_bom_robot_app-0.0.73.dist-info → ws_bom_robot_app-0.0.75.dist-info}/WHEEL +0 -0
- {ws_bom_robot_app-0.0.73.dist-info → ws_bom_robot_app-0.0.75.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/db/base.py

@@ -7,6 +7,7 @@ from langchain_core.language_models import BaseChatModel
 from langchain_core.vectorstores.base import VectorStoreRetriever, VectorStore
 from langchain.retrievers import SelfQueryRetriever
 from langchain.chains.query_constructor.schema import AttributeInfo
+import tiktoken

 class VectorDBStrategy(ABC):
 class VectorDBStrategy:

@@ -49,6 +50,52 @@ class VectorDBStrategy(ABC):
     Asynchronously invokes multiple retrievers in parallel, then merges
     their results while removing duplicates.
     """
+  def __init__(self):
+    self.max_tokens_per_batch = 300_000 * 0.8 # conservative limit below 300k openai limit: https://platform.openai.com/docs/api-reference/embeddings/create
+    try:
+      self.encoding = tiktoken.get_encoding("cl100k_base") # text-embedding-3-small, text-embedding-3-large: https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
+    except Exception:
+      self.encoding = None
+
+  def _count_tokens(self, text: str) -> int:
+    """Count tokens in text using tiktoken or fallback estimation"""
+    if self.encoding:
+      try:
+        return len(self.encoding.encode(text))
+      except Exception:
+        pass
+    # fallback: rough estimation (1 token ≈ 4 characters)
+    return len(text) // 4
+
+  def _batch_documents_by_tokens(self, documents: list[Document]) -> list[list[Document]]:
+    """Split documents into batches based on token count"""
+    if not documents:
+      return []
+    batches = []
+    current_batch = []
+    current_token_count = 0
+
+    for doc in documents:
+      doc_tokens = self._count_tokens(doc.page_content)
+      # check if adding this document exceeds the limit
+      if current_token_count + doc_tokens > self.max_tokens_per_batch:
+        # start new batch if current batch is not empty
+        if current_batch:
+          batches.append(current_batch)
+        # reset current batch
+        current_batch = [doc]
+        current_token_count = doc_tokens # reset to current doc's tokens
+      else:
+        # add to current batch
+        current_batch.append(doc)
+        current_token_count += doc_tokens
+
+    # add final batch if not empty
+    if current_batch:
+      batches.append(current_batch)
+
+    return batches
+
   _CACHE: dict[str, VectorStore] = {}
   def _clear_cache(self, key: str):
     if key in self._CACHE:
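The new VectorDBStrategy helpers split chunked documents into batches whose combined token count stays under 80% of OpenAI's 300,000-token embedding request limit, counting tokens with tiktoken's cl100k_base encoding and falling back to a characters/4 estimate. Below is a minimal, self-contained sketch of that batching behavior; the batch_by_tokens helper and MAX_TOKENS_PER_BATCH constant are illustrative names, not the package API, and it assumes tiktoken and langchain-core are installed.

```python
import tiktoken
from langchain_core.documents import Document

enc = tiktoken.get_encoding("cl100k_base")
MAX_TOKENS_PER_BATCH = int(300_000 * 0.8)  # same 80% safety margin as the diff

def batch_by_tokens(docs: list[Document], limit: int = MAX_TOKENS_PER_BATCH) -> list[list[Document]]:
    """Group documents so each batch stays under the embedding request token limit."""
    batches: list[list[Document]] = []
    current: list[Document] = []
    count = 0
    for doc in docs:
        tokens = len(enc.encode(doc.page_content))
        if current and count + tokens > limit:
            batches.append(current)          # current batch is full: start a new one
            current, count = [doc], tokens
        else:
            current.append(doc)
            count += tokens
    if current:
        batches.append(current)
    return batches

docs = [Document(page_content="lorem ipsum " * 25_000) for _ in range(6)]
print([len(b) for b in batch_by_tokens(docs)])  # several smaller batches instead of one oversized request
```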
ws_bom_robot_app/llm/vector_store/db/chroma.py

@@ -38,6 +38,9 @@ class Chroma(VectorDBStrategy):
     Returns:
         CHROMA: The retrieved or newly created Chroma instance.
     """
+  def __init__(self):
+    super().__init__()
+
   async def create(
     self,
     embeddings: Embeddings,
@@ -47,19 +50,35 @@ class Chroma(VectorDBStrategy):
   ) -> Optional[str]:
     try:
       chunked_docs = DocumentChunker.chunk(documents)
-
-
-
-
-
-
-
+      batches = self._batch_documents_by_tokens(chunked_docs)
+      logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
+      _instance: CHROMA = None
+      for i, batch in enumerate(batches):
+        batch_tokens = sum(self._count_tokens(doc.page_content) for doc in batch)
+        logging.info(f"processing batch {i+1}/{len(batches)} with {len(batch)} docs ({batch_tokens:,} tokens)")
+        # create instance from first batch
+        if _instance is None:
+          _instance = await asyncio.to_thread(
+            CHROMA.from_documents,
+            documents=batch,
+            embedding=embeddings,
+            persist_directory=storage_id
+          )
+        else:
+          # merge to existing instance
+          await _instance.aadd_documents(batch)
+        # add a small delay to avoid rate limiting
+        if i < len(batches) - 1: # except last batch
+          await asyncio.sleep(1)
+      if _instance:
+        self._clear_cache(storage_id)
+      logging.info(f"Successfully created {Chroma.__name__} index with {len(chunked_docs)} total documents")
       return storage_id
     except Exception as e:
       logging.error(f"{Chroma.__name__} create error: {e}")
       raise e
     finally:
-      del documents
+      del documents, chunked_docs, _instance
       gc.collect()

   def get_loader(
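The reworked Chroma.create builds the store from the first batch via asyncio.to_thread (so the blocking from_documents call does not stall the event loop), appends the remaining batches with aadd_documents, and sleeps one second between batches as a rate-limit guard. The sketch below shows only that control-flow pattern with stand-in functions; build_index and add_batch are illustrative placeholders, not package or langchain APIs.

```python
import asyncio, time

def build_index(batch: list[str]) -> list[str]:
    time.sleep(0.1)           # stands in for the blocking CHROMA.from_documents call
    return list(batch)

async def add_batch(index: list[str], batch: list[str]) -> None:
    await asyncio.sleep(0.1)  # stands in for the awaitable aadd_documents call
    index.extend(batch)

async def main() -> None:
    batches = [["a", "b"], ["c"], ["d", "e"]]
    index: list[str] | None = None
    for i, batch in enumerate(batches):
        if index is None:
            index = await asyncio.to_thread(build_index, batch)  # first batch creates the store off the event loop
        else:
            await add_batch(index, batch)                        # later batches are merged into the same store
        if i < len(batches) - 1:
            await asyncio.sleep(1)  # small delay between batches, mirroring the rate-limit guard
    print(index)

asyncio.run(main())
```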
ws_bom_robot_app/llm/vector_store/db/faiss.py

@@ -22,6 +22,9 @@ class Faiss(VectorDBStrategy):
     was previously loaded and cached, it returns the cached instance; otherwise,
     it loads the index from local storage and caches it for subsequent use.
     """
+  def __init__(self):
+    super().__init__()
+
   async def create(
     self,
     embeddings: Embeddings,
@@ -31,19 +34,42 @@ class Faiss(VectorDBStrategy):
   ) -> Optional[str]:
     try:
       chunked_docs = DocumentChunker.chunk(documents)
-
-
-
-
-
-
-
+      batches = self._batch_documents_by_tokens(chunked_docs)
+      logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
+      _instance: FAISS = None
+      for i, batch in enumerate(batches):
+        batch_tokens = sum(self._count_tokens(doc.page_content) for doc in batch)
+        logging.info(f"processing batch {i+1}/{len(batches)} with {len(batch)} docs ({batch_tokens:,} tokens)")
+        # init
+        _batch_instance = await asyncio.to_thread(
+          FAISS.from_documents,
+          batch,
+          embeddings
+        )
+        # create instance from first batch
+        if _instance is None:
+          _instance = _batch_instance
+        else:
+          # merge to existing instance
+          await asyncio.to_thread(
+            _instance.merge_from,
+            _batch_instance
+          )
+        del _batch_instance
+        gc.collect()
+        # add a small delay to avoid rate limiting
+        if i < len(batches) - 1: # except last batch
+          await asyncio.sleep(1)
+      if _instance:
+        await asyncio.to_thread(_instance.save_local, storage_id)
+        self._clear_cache(storage_id)
+      logging.info(f"Successfully created {Faiss.__name__} index with {len(chunked_docs)} total documents")
       return storage_id
     except Exception as e:
       logging.error(f"{Faiss.__name__} create error: {e}")
       raise e
     finally:
-      del documents, _instance
+      del documents, chunked_docs, _instance
       gc.collect()

   def get_loader(
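Faiss.create now builds a FAISS index per batch and folds it into the running index with merge_from before persisting it with save_local. A rough sketch of that incremental build follows; it assumes the faiss and langchain-community packages are installed, uses FakeEmbeddings as a stand-in embedding model, and the ./faiss_demo_index path is purely illustrative.

```python
from langchain_core.documents import Document
from langchain_core.embeddings import FakeEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = FakeEmbeddings(size=32)  # stand-in for the real embedding model
batches = [
    [Document(page_content="first batch, doc one"), Document(page_content="first batch, doc two")],
    [Document(page_content="second batch, doc one")],
]

index = None
for batch in batches:
    batch_index = FAISS.from_documents(batch, embeddings)  # build an index for this batch only
    if index is None:
        index = batch_index            # first batch becomes the base index
    else:
        index.merge_from(batch_index)  # later batches are merged into the base index
index.save_local("./faiss_demo_index")  # hypothetical output folder
print(index.index.ntotal)               # total vectors across all batches
```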
ws_bom_robot_app/llm/vector_store/generator.py

@@ -1,137 +1,137 @@
-import os, gc, shutil, logging, traceback
-import asyncio, aiofiles, aiofiles.os
-from fastapi import HTTPException
-from fastapi.responses import StreamingResponse
-from langchain_core.documents import Document
-from ws_bom_robot_app.llm.vector_store.loader.base import Loader
-from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
-from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
-from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
-from ws_bom_robot_app.config import config
-from ws_bom_robot_app.llm.models.kb import load_endpoints
-from ws_bom_robot_app.llm.utils.download import download_files
-
-async def _cleanup_directory(directory_path: str):
-  if os.path.exists(directory_path):
-    await asyncio.to_thread(shutil.rmtree, directory_path)
-
-#@timer
-async def rules(rq: RulesRequest) -> VectorDbResponse:
-  _config = rq.config()
-  db_name = rq.out_name()
-  store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
-  try:
-    await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
-    db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
-    return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
-  except Exception as e:
-    try:
-      await _cleanup_directory(store_path)
-    finally:
-      return VectorDbResponse(success = False, error = str(e))
-  finally:
-    gc.collect()
-
-#@atimer
-async def kb(rq: KbRequest) -> VectorDbResponse:
-  os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
-  _config = rq.config()
-  db_name = rq.out_name()
-  src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
-  working_path = os.path.join(src_path, db_name)
-
-  if all([not rq.files,not rq.endpoints,not rq.integrations]):
-    return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
-  else:
-    await aiofiles.os.makedirs(src_path, exist_ok=True)
-    await aiofiles.os.makedirs(working_path, exist_ok=True)
-
-  documents: list[Document] = []
-  # Download/copy all files
-  if rq.files:
-    try:
-      loaders = Loader(working_path)
-      filter_file_extensions = loaders.managed_file_extensions()
-      files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
-      if files_to_download:
-        await download_files(
-          [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
-          src_path, authorization=_config.robot_cms_auth)
-      # copy files to working tmp folder
-      for file in rq.files:
-        async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
-          async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
-            await dest_file.write(await src_file.read())
-      #load files
-      try:
-        documents.extend(await loaders.load())
-      except Exception as e:
-        tb = traceback.format_exc()
-        _error = f"File loader failure: {e} | {tb}"
-        logging.warning(_error)
-        return VectorDbResponse(success = False, error = _error)
-    except Exception as e:
-      await _cleanup_directory(working_path)
-      return VectorDbResponse(success = False, error = f"Failed to download file {e}")
-
-  if rq.endpoints:
-    try:
-      documents.extend(await load_endpoints(rq.endpoints, working_path))
-    except Exception as e:
-      await _cleanup_directory(working_path)
-      tb = traceback.format_exc()
-      _error = f"Endpoint failure: {e} | {tb}"
-      logging.warning(_error)
-      return VectorDbResponse(success = False, error = _error)
-
-  if rq.integrations:
-    tasks = []
-    for integration in rq.integrations:
-      tasks.append(
-        IntegrationManager
-        .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
-        .load()
-      )
-    try:
-      integration_documents = await asyncio.gather(*tasks)
-      for docs in integration_documents:
-        documents.extend(docs)
-    except Exception as e:
-      await _cleanup_directory(working_path)
-      tb = traceback.format_exc()
-      _error = f"Integration failure: {e} | {tb}"
-      logging.warning(_error)
-      return VectorDbResponse(success=False, error=_error)
-
-  #cleanup
-  await _cleanup_directory(working_path)
-
-  if documents and len(documents) > 0:
-    try:
-      store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
-      db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
-        os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
-        "zip",
-        await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
-      )
-      return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
-    except Exception as e:
-      await _cleanup_directory(store_path)
-      return VectorDbResponse(success = False, error = str(e))
-    finally:
-      del documents
-      gc.collect()
-  else:
-    _error = "No documents found in the knowledgebase folder"
-    logging.warning(_error)
-    return VectorDbResponse(success = False, error = _error)
-
-async def kb_stream_file(filename: str):
-  file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
-  if not os.path.isfile(file_path):
-    raise HTTPException(status_code=404, detail="File not found")
-  def iter_file():
-    with open(file_path, mode="rb") as file:
-      while chunk := file.read(1024*8):
-        yield chunk
-  return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
+import os, gc, shutil, logging, traceback
+import asyncio, aiofiles, aiofiles.os
+from fastapi import HTTPException
+from fastapi.responses import StreamingResponse
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
+from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
+from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
+from ws_bom_robot_app.config import config
+from ws_bom_robot_app.llm.models.kb import load_endpoints
+from ws_bom_robot_app.llm.utils.download import download_files
+
+async def _cleanup_directory(directory_path: str):
+  if os.path.exists(directory_path):
+    await asyncio.to_thread(shutil.rmtree, directory_path)
+
+#@timer
+async def rules(rq: RulesRequest) -> VectorDbResponse:
+  _config = rq.config()
+  db_name = rq.out_name()
+  store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
+  try:
+    await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
+    db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
+    return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
+  except Exception as e:
+    try:
+      await _cleanup_directory(store_path)
+    finally:
+      return VectorDbResponse(success = False, error = str(e))
+  finally:
+    gc.collect()
+
+#@atimer
+async def kb(rq: KbRequest) -> VectorDbResponse:
+  os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
+  _config = rq.config()
+  db_name = rq.out_name()
+  src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
+  working_path = os.path.join(src_path, db_name)
+
+  if all([not rq.files,not rq.endpoints,not rq.integrations]):
+    return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
+  else:
+    await aiofiles.os.makedirs(src_path, exist_ok=True)
+    await aiofiles.os.makedirs(working_path, exist_ok=True)
+
+  documents: list[Document] = []
+  # Download/copy all files
+  if rq.files:
+    try:
+      loaders = Loader(working_path)
+      filter_file_extensions = loaders.managed_file_extensions()
+      files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
+      if files_to_download:
+        await download_files(
+          [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
+          src_path, authorization=_config.robot_cms_auth)
+      # copy files to working tmp folder
+      for file in rq.files:
+        async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
+          async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
+            await dest_file.write(await src_file.read())
+      #load files
+      try:
+        documents.extend(await loaders.load())
+      except Exception as e:
+        tb = traceback.format_exc()
+        _error = f"File loader failure: {e} | {tb}"
+        logging.warning(_error)
+        return VectorDbResponse(success = False, error = _error)
+    except Exception as e:
+      await _cleanup_directory(working_path)
+      return VectorDbResponse(success = False, error = f"Failed to download file {e}")
+
+  if rq.endpoints:
+    try:
+      documents.extend(await load_endpoints(rq.endpoints, working_path))
+    except Exception as e:
+      await _cleanup_directory(working_path)
+      tb = traceback.format_exc()
+      _error = f"Endpoint failure: {e} | {tb}"
+      logging.warning(_error)
+      return VectorDbResponse(success = False, error = _error)
+
+  if rq.integrations:
+    tasks = []
+    for integration in rq.integrations:
+      tasks.append(
+        IntegrationManager
+        .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
+        .load()
+      )
+    try:
+      integration_documents = await asyncio.gather(*tasks)
+      for docs in integration_documents:
+        documents.extend(docs)
+    except Exception as e:
+      await _cleanup_directory(working_path)
+      tb = traceback.format_exc()
+      _error = f"Integration failure: {e} | {tb}"
+      logging.warning(_error)
+      return VectorDbResponse(success=False, error=_error)
+
+  #cleanup
+  await _cleanup_directory(working_path)
+
+  if documents and len(documents) > 0:
+    try:
+      store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
+      db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
+        os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
+        "zip",
+        await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
+      )
+      return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
+    except Exception as e:
+      await _cleanup_directory(store_path)
+      return VectorDbResponse(success = False, error = str(e))
+    finally:
+      del documents
+      gc.collect()
+  else:
+    _error = "No documents found in the knowledgebase folder"
+    logging.warning(_error)
+    return VectorDbResponse(success = False, error = _error)
+
+async def kb_stream_file(filename: str):
+  file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
+  if not os.path.isfile(file_path):
+    raise HTTPException(status_code=404, detail="File not found")
+  def iter_file():
+    with open(file_path, mode="rb") as file:
+      while chunk := file.read(1024*8):
+        yield chunk
+  return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
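The kb_stream_file function shown in the generator.py diff streams the zipped index back in 8 KiB chunks via StreamingResponse rather than loading it into memory. A standalone sketch of that pattern wired to a FastAPI route follows; the /db/{filename} path and DATA_DIR constant are illustrative, not the package's actual routing or configuration.

```python
import os
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse

app = FastAPI()
DATA_DIR = "./data/db/out"  # stand-in for the configured output folder

@app.get("/db/{filename}")
def download_db(filename: str) -> StreamingResponse:
    file_path = os.path.join(DATA_DIR, filename)
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="File not found")
    def iter_file():
        with open(file_path, "rb") as file:
            while chunk := file.read(1024 * 8):  # 8 KiB per chunk
                yield chunk
    return StreamingResponse(
        iter_file(),
        media_type="application/octet-stream",
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )
```

Mounted in an app, requesting /db/&lt;archive&gt;.zip streams the file to the client chunk by chunk, which keeps memory flat even for large index archives.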