ws-bom-robot-app 0.0.37__py3-none-any.whl → 0.0.103__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ws_bom_robot_app/config.py +35 -7
- ws_bom_robot_app/cron_manager.py +15 -14
- ws_bom_robot_app/llm/agent_context.py +26 -0
- ws_bom_robot_app/llm/agent_description.py +123 -123
- ws_bom_robot_app/llm/agent_handler.py +176 -180
- ws_bom_robot_app/llm/agent_lcel.py +107 -54
- ws_bom_robot_app/llm/api.py +100 -7
- ws_bom_robot_app/llm/defaut_prompt.py +15 -15
- ws_bom_robot_app/llm/evaluator.py +319 -0
- ws_bom_robot_app/llm/feedbacks/__init__.py +0 -0
- ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -0
- ws_bom_robot_app/llm/main.py +159 -110
- ws_bom_robot_app/llm/models/api.py +70 -5
- ws_bom_robot_app/llm/models/feedback.py +30 -0
- ws_bom_robot_app/llm/nebuly_handler.py +185 -0
- ws_bom_robot_app/llm/providers/llm_manager.py +244 -80
- ws_bom_robot_app/llm/tools/models/main.py +8 -0
- ws_bom_robot_app/llm/tools/tool_builder.py +68 -23
- ws_bom_robot_app/llm/tools/tool_manager.py +343 -133
- ws_bom_robot_app/llm/tools/utils.py +41 -25
- ws_bom_robot_app/llm/utils/agent.py +34 -0
- ws_bom_robot_app/llm/utils/chunker.py +6 -1
- ws_bom_robot_app/llm/utils/cleanup.py +81 -0
- ws_bom_robot_app/llm/utils/cms.py +123 -0
- ws_bom_robot_app/llm/utils/download.py +183 -79
- ws_bom_robot_app/llm/utils/print.py +29 -29
- ws_bom_robot_app/llm/vector_store/db/__init__.py +0 -0
- ws_bom_robot_app/llm/vector_store/db/base.py +193 -0
- ws_bom_robot_app/llm/vector_store/db/chroma.py +97 -0
- ws_bom_robot_app/llm/vector_store/db/faiss.py +91 -0
- ws_bom_robot_app/llm/vector_store/db/manager.py +15 -0
- ws_bom_robot_app/llm/vector_store/db/qdrant.py +73 -0
- ws_bom_robot_app/llm/vector_store/generator.py +137 -137
- ws_bom_robot_app/llm/vector_store/integration/api.py +216 -0
- ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/base.py +58 -15
- ws_bom_robot_app/llm/vector_store/integration/confluence.py +41 -11
- ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
- ws_bom_robot_app/llm/vector_store/integration/googledrive.py +46 -17
- ws_bom_robot_app/llm/vector_store/integration/jira.py +112 -75
- ws_bom_robot_app/llm/vector_store/integration/manager.py +6 -2
- ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
- ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
- ws_bom_robot_app/llm/vector_store/integration/sitemap.py +9 -1
- ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
- ws_bom_robot_app/llm/vector_store/integration/thron.py +236 -0
- ws_bom_robot_app/llm/vector_store/loader/base.py +52 -8
- ws_bom_robot_app/llm/vector_store/loader/docling.py +71 -33
- ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
- ws_bom_robot_app/main.py +148 -146
- ws_bom_robot_app/subprocess_runner.py +106 -0
- ws_bom_robot_app/task_manager.py +207 -54
- ws_bom_robot_app/util.py +65 -20
- ws_bom_robot_app-0.0.103.dist-info/METADATA +364 -0
- ws_bom_robot_app-0.0.103.dist-info/RECORD +76 -0
- {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/WHEEL +1 -1
- ws_bom_robot_app/llm/settings.py +0 -4
- ws_bom_robot_app/llm/utils/agent_utils.py +0 -17
- ws_bom_robot_app/llm/utils/kb.py +0 -34
- ws_bom_robot_app-0.0.37.dist-info/METADATA +0 -277
- ws_bom_robot_app-0.0.37.dist-info/RECORD +0 -60
- {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/top_level.txt +0 -0
|
@@ -1,137 +1,137 @@
|
|
|
1
|
-
import os, gc, shutil, logging, traceback
|
|
2
|
-
import asyncio, aiofiles, aiofiles.os
|
|
3
|
-
from fastapi import HTTPException
|
|
4
|
-
from fastapi.responses import StreamingResponse
|
|
5
|
-
from langchain_core.documents import Document
|
|
6
|
-
from ws_bom_robot_app.llm.vector_store.loader.base import Loader
|
|
7
|
-
from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
|
|
8
|
-
from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
|
|
9
|
-
from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
|
|
10
|
-
from ws_bom_robot_app.config import config
|
|
11
|
-
from ws_bom_robot_app.llm.models.kb import load_endpoints
|
|
12
|
-
from ws_bom_robot_app.llm.utils.download import download_files
|
|
13
|
-
|
|
14
|
-
async def _cleanup_directory(directory_path: str):
|
|
15
|
-
if os.path.exists(directory_path):
|
|
16
|
-
await asyncio.to_thread(shutil.rmtree, directory_path)
|
|
17
|
-
|
|
18
|
-
#@timer
|
|
19
|
-
async def rules(rq: RulesRequest) -> VectorDbResponse:
|
|
20
|
-
_config = rq.config()
|
|
21
|
-
db_name = rq.out_name()
|
|
22
|
-
store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
|
|
23
|
-
try:
|
|
24
|
-
await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
|
|
25
|
-
db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
|
|
26
|
-
return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
|
|
27
|
-
except Exception as e:
|
|
28
|
-
try:
|
|
29
|
-
await _cleanup_directory(store_path)
|
|
30
|
-
finally:
|
|
31
|
-
return VectorDbResponse(success = False, error = str(e))
|
|
32
|
-
finally:
|
|
33
|
-
gc.collect()
|
|
34
|
-
|
|
35
|
-
#@atimer
|
|
36
|
-
async def kb(rq: KbRequest) -> VectorDbResponse:
|
|
37
|
-
os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
|
|
38
|
-
_config = rq.config()
|
|
39
|
-
db_name = rq.out_name()
|
|
40
|
-
src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
|
|
41
|
-
working_path = os.path.join(src_path, db_name)
|
|
42
|
-
|
|
43
|
-
if all([not rq.files,not rq.endpoints,not rq.integrations]):
|
|
44
|
-
return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
|
|
45
|
-
else:
|
|
46
|
-
await aiofiles.os.makedirs(src_path, exist_ok=True)
|
|
47
|
-
await aiofiles.os.makedirs(working_path, exist_ok=True)
|
|
48
|
-
|
|
49
|
-
documents: list[Document] = []
|
|
50
|
-
# Download/copy all files
|
|
51
|
-
if rq.files:
|
|
52
|
-
try:
|
|
53
|
-
loaders = Loader(working_path)
|
|
54
|
-
filter_file_extensions = loaders.managed_file_extensions()
|
|
55
|
-
files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
|
|
56
|
-
if files_to_download:
|
|
57
|
-
await download_files(
|
|
58
|
-
[f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
|
|
59
|
-
src_path, authorization=_config.robot_cms_auth)
|
|
60
|
-
# copy files to working tmp folder
|
|
61
|
-
for file in rq.files:
|
|
62
|
-
async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
|
|
63
|
-
async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
|
|
64
|
-
await dest_file.write(await src_file.read())
|
|
65
|
-
#load files
|
|
66
|
-
try:
|
|
67
|
-
documents.extend(await loaders.load())
|
|
68
|
-
except Exception as e:
|
|
69
|
-
tb = traceback.format_exc()
|
|
70
|
-
_error = f"File loader failure: {e} | {tb}"
|
|
71
|
-
logging.warning(_error)
|
|
72
|
-
return VectorDbResponse(success = False, error = _error)
|
|
73
|
-
except Exception as e:
|
|
74
|
-
await _cleanup_directory(working_path)
|
|
75
|
-
return VectorDbResponse(success = False, error = f"Failed to download file {e}")
|
|
76
|
-
|
|
77
|
-
if rq.endpoints:
|
|
78
|
-
try:
|
|
79
|
-
documents.extend(await load_endpoints(rq.endpoints, working_path))
|
|
80
|
-
except Exception as e:
|
|
81
|
-
await _cleanup_directory(working_path)
|
|
82
|
-
tb = traceback.format_exc()
|
|
83
|
-
_error = f"Endpoint failure: {e} | {tb}"
|
|
84
|
-
logging.warning(_error)
|
|
85
|
-
return VectorDbResponse(success = False, error = _error)
|
|
86
|
-
|
|
87
|
-
if rq.integrations:
|
|
88
|
-
tasks = []
|
|
89
|
-
for integration in rq.integrations:
|
|
90
|
-
tasks.append(
|
|
91
|
-
IntegrationManager
|
|
92
|
-
.get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
|
|
93
|
-
.load()
|
|
94
|
-
)
|
|
95
|
-
try:
|
|
96
|
-
integration_documents = await asyncio.gather(*tasks)
|
|
97
|
-
for docs in integration_documents:
|
|
98
|
-
documents.extend(docs)
|
|
99
|
-
except Exception as e:
|
|
100
|
-
await _cleanup_directory(working_path)
|
|
101
|
-
tb = traceback.format_exc()
|
|
102
|
-
_error = f"Integration failure: {e} | {tb}"
|
|
103
|
-
logging.warning(_error)
|
|
104
|
-
return VectorDbResponse(success=False, error=_error)
|
|
105
|
-
|
|
106
|
-
#cleanup
|
|
107
|
-
await _cleanup_directory(working_path)
|
|
108
|
-
|
|
109
|
-
if documents and len(documents) > 0:
|
|
110
|
-
try:
|
|
111
|
-
store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
|
|
112
|
-
db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
|
|
113
|
-
os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
|
|
114
|
-
"zip",
|
|
115
|
-
await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
|
|
116
|
-
)
|
|
117
|
-
return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
|
|
118
|
-
except Exception as e:
|
|
119
|
-
await _cleanup_directory(store_path)
|
|
120
|
-
return VectorDbResponse(success = False, error = str(e))
|
|
121
|
-
finally:
|
|
122
|
-
del documents
|
|
123
|
-
gc.collect()
|
|
124
|
-
else:
|
|
125
|
-
_error = "No documents found in the knowledgebase folder"
|
|
126
|
-
logging.warning(_error)
|
|
127
|
-
return VectorDbResponse(success = False, error = _error)
|
|
128
|
-
|
|
129
|
-
async def kb_stream_file(filename: str):
|
|
130
|
-
file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
|
|
131
|
-
if not os.path.isfile(file_path):
|
|
132
|
-
raise HTTPException(status_code=404, detail="File not found")
|
|
133
|
-
def iter_file():
|
|
134
|
-
with open(file_path, mode="rb") as file:
|
|
135
|
-
while chunk := file.read(1024*8):
|
|
136
|
-
yield chunk
|
|
137
|
-
return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
|
|
1
|
+
import os, gc, shutil, logging, traceback
|
|
2
|
+
import asyncio, aiofiles, aiofiles.os
|
|
3
|
+
from fastapi import HTTPException
|
|
4
|
+
from fastapi.responses import StreamingResponse
|
|
5
|
+
from langchain_core.documents import Document
|
|
6
|
+
from ws_bom_robot_app.llm.vector_store.loader.base import Loader
|
|
7
|
+
from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
|
|
8
|
+
from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
|
|
9
|
+
from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
|
|
10
|
+
from ws_bom_robot_app.config import config
|
|
11
|
+
from ws_bom_robot_app.llm.models.kb import load_endpoints
|
|
12
|
+
from ws_bom_robot_app.llm.utils.download import download_files
|
|
13
|
+
|
|
14
|
+
async def _cleanup_directory(directory_path: str):
|
|
15
|
+
if os.path.exists(directory_path):
|
|
16
|
+
await asyncio.to_thread(shutil.rmtree, directory_path)
|
|
17
|
+
|
|
18
|
+
#@timer
|
|
19
|
+
async def rules(rq: RulesRequest) -> VectorDbResponse:
    """Build a vector store from the request's rules and zip it.

    Each entry of ``rq.rules`` becomes a ``Document`` tagged with
    ``{"source": "rules"}``. The store is created under the configured
    "store" folder and archived as ``<db_name>.zip`` in the "out" folder.

    Args:
        rq: Request providing the rules, embeddings, vector type, output name
            and configuration.

    Returns:
        A response carrying the archive file name and vector type on success,
        or ``success=False`` with the error text on failure.
    """
    _config = rq.config()
    db_name = rq.out_name()
    db_root = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder)
    store_path = os.path.join(db_root, _config.robot_data_db_folder_store, db_name)
    try:
        await VectorDbManager.get_strategy(rq.vector_type).create(
            rq.embeddings(),
            [Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules],
            store_path,
        )  # type: ignore
        # Archiving is blocking disk I/O: run it off the event loop.
        db_file_path = await asyncio.to_thread(
            shutil.make_archive,
            os.path.join(db_root, _config.robot_data_db_folder_out, db_name),
            "zip",
            store_path,
        )
        return VectorDbResponse(file=os.path.basename(db_file_path), vector_type=rq.vector_type)
    except Exception as e:
        # Best-effort cleanup: a cleanup failure must not hide the original
        # error, but it is logged instead of being silently discarded (and
        # cancellation is no longer swallowed by a ``return`` in ``finally``).
        try:
            await _cleanup_directory(store_path)
        except Exception as cleanup_error:
            logging.warning(f"Failed to clean up {store_path}: {cleanup_error}")
        return VectorDbResponse(success=False, error=str(e))
    finally:
        gc.collect()
|
|
34
|
+
|
|
35
|
+
#@atimer
|
|
36
|
+
async def kb(rq: KbRequest) -> VectorDbResponse:
    """Build a knowledge-base vector store from files, endpoints and integrations.

    Steps:
      1. Download the requested files that are not yet in the source folder
         and copy every requested file into a per-request working folder.
      2. Load documents from the working folder, the endpoints and the
         integrations.
      3. Remove the working folder.
      4. Create the vector store and zip it into the "out" folder.

    Every failure is reported as ``VectorDbResponse(success=False, ...)`` and
    the working folder is removed on each failure path.

    Args:
        rq: Request providing files, endpoints, integrations, embeddings,
            vector type, output name and configuration.

    Returns:
        A response with the archive file name and vector type on success, or
        ``success=False`` with an error description.
    """
    os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
    _config = rq.config()
    db_name = rq.out_name()
    src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
    working_path = os.path.join(src_path, db_name)

    if not (rq.files or rq.endpoints or rq.integrations):
        return VectorDbResponse(success=False, error="No files, endpoints or integrations provided")

    await aiofiles.os.makedirs(src_path, exist_ok=True)
    await aiofiles.os.makedirs(working_path, exist_ok=True)

    documents: list[Document] = []

    # Download/copy all files
    if rq.files:
        try:
            loaders = Loader(working_path)
            filter_file_extensions = loaders.managed_file_extensions()
            files_to_download = [
                file for file in rq.files
                if not os.path.exists(os.path.join(src_path, os.path.basename(file)))
            ]
            if files_to_download:
                # Only files with an extension the loader manages are fetched.
                download_urls = [
                    f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}"
                    for file in files_to_download
                    if any(file.endswith(ext) for ext in filter_file_extensions)
                ]
                await download_files(download_urls, src_path, authorization=_config.robot_cms_auth)
            # copy files to working tmp folder
            for file in rq.files:
                file_name = os.path.basename(file)
                async with aiofiles.open(os.path.join(src_path, file_name), 'rb') as src_file:
                    async with aiofiles.open(os.path.join(working_path, file_name), 'wb') as dest_file:
                        await dest_file.write(await src_file.read())
            # load files
            try:
                documents.extend(await loaders.load())
            except Exception as e:
                tb = traceback.format_exc()
                _error = f"File loader failure: {e} | {tb}"
                logging.warning(_error)
                # Previously this path returned without removing the working
                # folder, unlike every other failure path.
                await _cleanup_directory(working_path)
                return VectorDbResponse(success=False, error=_error)
        except Exception as e:
            await _cleanup_directory(working_path)
            return VectorDbResponse(success=False, error=f"Failed to download file {e}")

    if rq.endpoints:
        try:
            documents.extend(await load_endpoints(rq.endpoints, working_path))
        except Exception as e:
            await _cleanup_directory(working_path)
            tb = traceback.format_exc()
            _error = f"Endpoint failure: {e} | {tb}"
            logging.warning(_error)
            return VectorDbResponse(success=False, error=_error)

    if rq.integrations:
        try:
            # Build every strategy before creating any coroutine, inside the
            # try: an unknown type or invalid parameters is then reported as an
            # integration failure (with cleanup) and no coroutine is left
            # un-awaited.
            strategies = [
                IntegrationManager.get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__)  # type: ignore
                for integration in rq.integrations
            ]
            integration_documents = await asyncio.gather(*(strategy.load() for strategy in strategies))
            for docs in integration_documents:
                documents.extend(docs)
        except Exception as e:
            await _cleanup_directory(working_path)
            tb = traceback.format_exc()
            _error = f"Integration failure: {e} | {tb}"
            logging.warning(_error)
            return VectorDbResponse(success=False, error=_error)

    # cleanup
    await _cleanup_directory(working_path)

    if not documents:
        _error = "No documents found in the knowledgebase folder"
        logging.warning(_error)
        return VectorDbResponse(success=False, error=_error)

    try:
        store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
        # ``create(..., return_folder_path=True)`` supplies the folder that is
        # archived; the archive call is wrapped so it runs off the event loop.
        db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
            os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
            "zip",
            await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
        )
        return VectorDbResponse(file=os.path.basename(db_file_path), vector_type=rq.vector_type)
    except Exception as e:
        await _cleanup_directory(store_path)
        return VectorDbResponse(success=False, error=str(e))
    finally:
        # Drop the (potentially large) document list before collecting.
        del documents
        gc.collect()
|
|
128
|
+
|
|
129
|
+
async def kb_stream_file(filename: str):
    """Stream a generated archive from the configured "out" folder.

    Args:
        filename: Name of the file to download, relative to the "out" folder.

    Returns:
        A ``StreamingResponse`` that sends the file in 8 KiB chunks as an
        ``application/octet-stream`` attachment.

    Raises:
        HTTPException: 404 when the file does not exist or when ``filename``
            resolves outside the "out" folder.
    """
    out_dir = os.path.abspath(
        os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out)
    )
    file_path = os.path.abspath(os.path.join(out_dir, filename))
    # ``filename`` comes from the request: refuse anything that escapes the
    # output folder ("../", absolute paths) instead of serving arbitrary files.
    try:
        inside_out_dir = os.path.commonpath([out_dir, file_path]) == out_dir
    except ValueError:
        # e.g. paths on different drives
        inside_out_dir = False
    if not inside_out_dir or not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="File not found")

    def iter_file():
        with open(file_path, mode="rb") as file:
            while chunk := file.read(1024*8):
                yield chunk

    return StreamingResponse(
        iter_file(),
        media_type="application/octet-stream",
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import asyncio, logging, aiohttp
|
|
2
|
+
from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
|
|
3
|
+
from langchain_core.documents import Document
|
|
4
|
+
from ws_bom_robot_app.llm.vector_store.loader.base import Loader
|
|
5
|
+
from typing import List, Union, Optional, Dict, Any, Literal
|
|
6
|
+
from pydantic import BaseModel, Field, AliasChoices, field_validator
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AuthConfig(BaseModel):
    """
    Authentication settings for an API request.

    Attributes:
        type: Authentication scheme: bearer, basic, api_key, custom or none (default)
        token: Value used as bearer token or API key
        username: User name for basic auth
        password: Password for basic auth
        header_name: Header that carries the API key (also accepted as ``headerName``)
        prefix: Text placed before the auth value (e.g., 'Bearer', 'Token')
    """
    type: Literal["bearer", "basic", "api_key", "custom", "none"] = "none"
    token: Optional[str] = None
    username: Optional[str] = None
    password: Optional[str] = None
    header_name: Optional[str] = Field(
        default=None,
        validation_alias=AliasChoices("headerName", "header_name"),
    )
    prefix: Optional[str] = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ApiParams(BaseModel):
    """
    Parameters of the generic API integration.

    Attributes:
        url: URL of the API endpoint (also accepted as ``endpoint``)
        method: HTTP method: GET (default), POST, PUT, DELETE or PATCH
        headers: Extra headers sent with the request
        params: Query parameters sent with the request
        body: Request body, as a dict or a raw string
        auth: Authentication configuration
        response_data_path: Dot-separated path of the data inside the response (e.g., 'data.items', 'results')
        max_retries: Maximum number of retries for a failed request (default 5)
        retry_delay: Base delay in seconds between retries (default 1.0)
        timeout: Request timeout in seconds (default 30)
    """
    url: str = Field(validation_alias=AliasChoices("url", "endpoint"))
    method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"] = "GET"
    headers: Optional[Dict[str, str]] = Field(default_factory=dict)
    params: Optional[Dict[str, Any]] = Field(default_factory=dict)
    body: Optional[Union[Dict[str, Any], str]] = None
    auth: Optional[AuthConfig] = Field(default_factory=AuthConfig)
    response_data_path: Optional[str] = Field(
        default=None,
        validation_alias=AliasChoices("responseDataPath", "response_data_path"),
    )
    max_retries: int = Field(default=5, validation_alias=AliasChoices("maxRetries", "max_retries"))
    retry_delay: float = Field(default=1.0, validation_alias=AliasChoices("retryDelay", "retry_delay"))
    timeout: int = 30

    @field_validator('auth', mode='before')
    @classmethod
    def parse_auth(cls, v):
        """Turn a raw dict into an AuthConfig; replace an empty value with the default one."""
        if isinstance(v, dict):
            return AuthConfig(**v)
        if v:
            return v
        return AuthConfig()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class Api(IntegrationStrategy):
    """
    Generic API integration.

    Performs a single HTTP request described by ``ApiParams``, optionally
    extracts a nested part of the JSON response, stores the result as
    ``api_data.json`` in the working directory and loads it through ``Loader``.

    Supports:
    - HTTP methods GET, POST, PUT, DELETE, PATCH
    - Bearer, Basic and API-key authentication headers
    - Custom headers and query parameters
    - Retry with exponential backoff on HTTP 429, client errors and timeouts
    - Dot-notation extraction of the response data
    """

    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str, int, list]]):
        """Validate the raw integration ``data`` into ``ApiParams``."""
        super().__init__(knowledgebase_path, data)
        self.__data = ApiParams.model_validate(self.data)

    def working_subdirectory(self) -> str:
        """Return the name of the sub-folder used by this integration."""
        return 'api_integration'

    async def run(self) -> None:
        """Fetch data from the API and save it to ``api_data.json``."""
        _data = await self.__fetch_data()
        json_file_path = os.path.join(self.working_directory, 'api_data.json')
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(_data, f, ensure_ascii=False, indent=2)
        logging.info(f"Saved {len(_data) if isinstance(_data, list) else 1} items to {json_file_path}")

    async def load(self) -> list[Document]:
        """Fetch the API data, then load the working directory as documents."""
        await self.run()
        # NOTE(review): fixed 1s pause kept from the original; its purpose is
        # not visible here - confirm whether it is still needed.
        await asyncio.sleep(1)
        return await Loader(self.working_directory).load()

    def __prepare_headers(self) -> Dict[str, str]:
        """Return the request headers: configured headers plus content type and auth.

        ``auth.type`` values "custom" and "none" add no header here.
        """
        headers = self.__data.headers.copy() if self.__data.headers else {}

        # Add Content-Type if not present. Header names are case-insensitive,
        # so a caller-supplied "content-type" must not be duplicated.
        has_content_type = any(name.lower() == 'content-type' for name in headers)
        if not has_content_type and self.__data.method in ["POST", "PUT", "PATCH"]:
            headers['Content-Type'] = 'application/json'

        # Add authentication
        auth = self.__data.auth
        if auth.type == "bearer":
            prefix = auth.prefix or "Bearer"
            headers['Authorization'] = f"{prefix} {auth.token}"
        elif auth.type == "basic":
            import base64
            credentials = f"{auth.username}:{auth.password}"
            encoded = base64.b64encode(credentials.encode()).decode()
            headers['Authorization'] = f"Basic {encoded}"
        elif auth.type == "api_key" and auth.header_name:
            prefix = f"{auth.prefix} " if auth.prefix else ""
            headers[auth.header_name] = f"{prefix}{auth.token}"

        return headers

    def __get_nested_value(self, data: Any, path: Optional[str]) -> Any:
        """Extract a nested value from ``data`` using a dot-notation ``path``.

        Dict levels are resolved by key, list levels by numeric index.

        Returns:
            ``data`` itself when ``path`` is empty; otherwise the value found,
            or None when any step cannot be resolved (missing key, index out
            of range, non-container value).
        """
        if not path:
            return data

        current = data
        for key in path.split('.'):
            if isinstance(current, dict):
                current = current.get(key)
            elif isinstance(current, list) and key.isdigit():
                try:
                    current = current[int(key)]
                except (IndexError, ValueError):
                    # Out-of-range index (or a digit int() cannot parse):
                    # treat like a missing key instead of crashing.
                    return None
            else:
                return None

            if current is None:
                return None

        return current

    async def __make_request(
        self,
        url: str,
        headers: Dict[str, str],
        params: Optional[Dict[str, Any]] = None
    ) -> Any:
        """Send the configured HTTP request, retrying with exponential backoff.

        HTTP 429, ``aiohttp.ClientError`` (including error statuses raised by
        ``raise_for_status``) and request timeouts are retried up to
        ``max_retries`` times, waiting ``retry_delay * 2 ** attempt`` seconds.

        Returns:
            The decoded JSON body, or ``{"text": <body>}`` when aiohttp
            reports a non-JSON content type.

        Raises:
            Exception: when the retries are exhausted.
        """
        retry_count = 0

        while retry_count <= self.__data.max_retries:
            try:
                timeout = aiohttp.ClientTimeout(total=self.__data.timeout)

                async with aiohttp.ClientSession(timeout=timeout) as session:
                    request_kwargs = {
                        "headers": headers,
                        "params": params or self.__data.params
                    }

                    # Add body for POST/PUT/PATCH
                    if self.__data.method in ["POST", "PUT", "PATCH"] and self.__data.body:
                        if isinstance(self.__data.body, dict):
                            request_kwargs["json"] = self.__data.body
                        else:
                            request_kwargs["data"] = self.__data.body

                    async with session.request(
                        self.__data.method,
                        url,
                        **request_kwargs
                    ) as response:
                        # Check response status
                        if response.status == 429:  # Rate limit
                            retry_count += 1
                            if retry_count > self.__data.max_retries:
                                raise Exception("Rate limit exceeded. Maximum retries reached.")

                            wait_time = self.__data.retry_delay * (2 ** retry_count)
                            logging.warning(f"Rate limited. Waiting {wait_time}s (Attempt {retry_count}/{self.__data.max_retries})")
                            await asyncio.sleep(wait_time)
                            continue

                        response.raise_for_status()

                        # Parse response
                        try:
                            data = await response.json()
                            return data
                        except aiohttp.ContentTypeError:
                            text = await response.text()
                            logging.warning(f"Non-JSON response received: {text[:200]}")
                            return {"text": text}

            # The total timeout surfaces as asyncio.TimeoutError, which is not
            # an aiohttp.ClientError: catch it too so timeouts are retried.
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                retry_count += 1
                if retry_count > self.__data.max_retries:
                    raise Exception(f"Request failed after {self.__data.max_retries} retries: {e}")

                wait_time = self.__data.retry_delay * (2 ** retry_count)
                logging.warning(f"Request error: {e}. Retrying in {wait_time}s...")
                await asyncio.sleep(wait_time)
                continue

        raise Exception("Maximum retries exceeded")

    async def __fetch_data(self) -> Any:
        """Request the configured URL and return the extracted payload.

        When ``response_data_path`` resolves to nothing, the whole response is
        returned instead.
        """
        headers = self.__prepare_headers()
        response = await self.__make_request(self.__data.url, headers)

        # Extract data from response using path if specified
        data = self.__get_nested_value(response, self.__data.response_data_path)
        return data if data is not None else response
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
|
|
3
|
-
from unstructured_ingest.
|
|
3
|
+
from unstructured_ingest.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
|
|
4
4
|
from langchain_core.documents import Document
|
|
5
5
|
from ws_bom_robot_app.llm.vector_store.loader.base import Loader
|
|
6
6
|
from typing import Union, Optional
|
|
@@ -1,10 +1,17 @@
|
|
|
1
|
-
import os
|
|
1
|
+
import os, copy
|
|
2
|
+
from random import random
|
|
2
3
|
from langchain_core.documents import Document
|
|
3
4
|
from abc import ABC, abstractmethod
|
|
4
|
-
from unstructured_ingest.
|
|
5
|
-
from unstructured_ingest.
|
|
5
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
6
|
+
from unstructured_ingest.pipeline.pipeline import (
|
|
7
|
+
Pipeline,
|
|
8
|
+
PartitionerConfig,
|
|
9
|
+
FiltererConfig
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.processes.connector_registry import source_registry
|
|
6
12
|
from typing import Union
|
|
7
13
|
from ws_bom_robot_app.llm.utils.secrets import Secrets
|
|
14
|
+
from ws_bom_robot_app.config import config
|
|
8
15
|
|
|
9
16
|
class IntegrationStrategy(ABC):
|
|
10
17
|
@classmethod
|
|
@@ -32,23 +39,59 @@ class IntegrationStrategy(ABC):
|
|
|
32
39
|
pass
|
|
33
40
|
|
|
34
41
|
class UnstructuredIngest():
|
|
42
|
+
_PIPELINE: Pipeline = None
|
|
35
43
|
def __init__(self, working_directory: str):
|
|
36
44
|
self.working_directory = working_directory
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
45
|
+
self._runtime_options = config.runtime_options()
|
|
46
|
+
def pipeline(self,indexer_config,downloader_config,connection_config,extension: list[str] = None) -> Pipeline:
|
|
47
|
+
def _default_processor_config() -> ProcessorConfig:
|
|
48
|
+
return ProcessorConfig(
|
|
40
49
|
reprocess=False,
|
|
41
50
|
verbose=False,
|
|
42
51
|
tqdm=False,
|
|
43
|
-
num_processes=
|
|
52
|
+
num_processes=config.robot_ingest_max_threads, #safe choice to 1, avoid potential process-related issues with Docker
|
|
53
|
+
disable_parallelism=False,
|
|
44
54
|
preserve_downloads=True,
|
|
45
55
|
download_only=True,
|
|
46
|
-
raise_on_error=False
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
56
|
+
raise_on_error=False,
|
|
57
|
+
iter_delete=True,
|
|
58
|
+
delete_cache=False #already managed by the generator task
|
|
59
|
+
)
|
|
60
|
+
def _init_pipeline() -> Pipeline:
|
|
61
|
+
return Pipeline.from_configs(
|
|
62
|
+
context=_default_processor_config(),
|
|
63
|
+
indexer_config=indexer_config,
|
|
64
|
+
downloader_config=downloader_config,
|
|
65
|
+
source_connection_config=connection_config,
|
|
66
|
+
partitioner_config=PartitionerConfig(),
|
|
67
|
+
filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
|
|
68
|
+
)
|
|
69
|
+
def _instance_pipeline() -> Pipeline:
|
|
70
|
+
from unstructured_ingest.pipeline.steps.index import IndexStep
|
|
71
|
+
from unstructured_ingest.pipeline.steps.download import DownloadStep
|
|
72
|
+
from unstructured_ingest.pipeline.steps.filter import Filterer, FilterStep
|
|
73
|
+
_context = _default_processor_config()
|
|
74
|
+
source_entry = {
|
|
75
|
+
k: v
|
|
76
|
+
for k, v in source_registry.items()
|
|
77
|
+
if type(indexer_config) is v.indexer_config
|
|
78
|
+
and type(downloader_config) is v.downloader_config
|
|
79
|
+
and type(connection_config) is v.connection_config
|
|
80
|
+
}
|
|
81
|
+
source = list(source_entry.values())[0]
|
|
82
|
+
_pipeline = copy.deepcopy(UnstructuredIngest._PIPELINE)
|
|
83
|
+
_pipeline.context = _context
|
|
84
|
+
_pipeline.context.work_dir = f"{self.working_directory}_unstructured" # use sibling directory, cleaned up by the generator task
|
|
85
|
+
_pipeline.indexer_step = IndexStep(process=source.indexer(index_config=indexer_config, connection_config=connection_config), context=_context)
|
|
86
|
+
_pipeline.downloader_step = DownloadStep(process=source.downloader(download_config=downloader_config, connection_config=connection_config), context=_context)
|
|
87
|
+
_pipeline.filter_step = FilterStep(process=Filterer(config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)), context=_context) if extension else None
|
|
88
|
+
return _pipeline
|
|
54
89
|
|
|
90
|
+
if not UnstructuredIngest._PIPELINE:
|
|
91
|
+
import random
|
|
92
|
+
import time
|
|
93
|
+
time.sleep(random.uniform(0.2, 1))
|
|
94
|
+
if not UnstructuredIngest._PIPELINE:
|
|
95
|
+
UnstructuredIngest._PIPELINE = _init_pipeline()
|
|
96
|
+
|
|
97
|
+
return _instance_pipeline()
|