ws-bom-robot-app 0.0.87__py3-none-any.whl → 0.0.89__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ws_bom_robot_app/llm/agent_description.py +123 -123
- ws_bom_robot_app/llm/agent_handler.py +174 -174
- ws_bom_robot_app/llm/agent_lcel.py +50 -50
- ws_bom_robot_app/llm/defaut_prompt.py +15 -15
- ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -66
- ws_bom_robot_app/llm/main.py +158 -158
- ws_bom_robot_app/llm/models/feedback.py +30 -30
- ws_bom_robot_app/llm/nebuly_handler.py +185 -185
- ws_bom_robot_app/llm/tools/tool_builder.py +68 -68
- ws_bom_robot_app/llm/tools/tool_manager.py +332 -332
- ws_bom_robot_app/llm/tools/utils.py +41 -41
- ws_bom_robot_app/llm/utils/agent.py +34 -34
- ws_bom_robot_app/llm/utils/cms.py +114 -114
- ws_bom_robot_app/llm/utils/download.py +183 -183
- ws_bom_robot_app/llm/utils/print.py +29 -29
- ws_bom_robot_app/llm/vector_store/db/base.py +3 -0
- ws_bom_robot_app/llm/vector_store/db/chroma.py +1 -0
- ws_bom_robot_app/llm/vector_store/db/faiss.py +1 -0
- ws_bom_robot_app/llm/vector_store/db/qdrant.py +1 -0
- ws_bom_robot_app/llm/vector_store/generator.py +137 -137
- ws_bom_robot_app/llm/vector_store/integration/confluence.py +33 -5
- ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -143
- ws_bom_robot_app/llm/vector_store/integration/sitemap.py +3 -1
- ws_bom_robot_app/llm/vector_store/integration/thron.py +236 -102
- ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
- {ws_bom_robot_app-0.0.87.dist-info → ws_bom_robot_app-0.0.89.dist-info}/METADATA +2 -2
- {ws_bom_robot_app-0.0.87.dist-info → ws_bom_robot_app-0.0.89.dist-info}/RECORD +29 -29
- {ws_bom_robot_app-0.0.87.dist-info → ws_bom_robot_app-0.0.89.dist-info}/WHEEL +0 -0
- {ws_bom_robot_app-0.0.87.dist-info → ws_bom_robot_app-0.0.89.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/generator.py

@@ -1,137 +1,137 @@
(every line of the file is removed and re-added with identical content; the 137 lines are shown once)

import os, gc, shutil, logging, traceback
import asyncio, aiofiles, aiofiles.os
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from langchain_core.documents import Document
from ws_bom_robot_app.llm.vector_store.loader.base import Loader
from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
from ws_bom_robot_app.config import config
from ws_bom_robot_app.llm.models.kb import load_endpoints
from ws_bom_robot_app.llm.utils.download import download_files

async def _cleanup_directory(directory_path: str):
    if os.path.exists(directory_path):
        await asyncio.to_thread(shutil.rmtree, directory_path)

#@timer
async def rules(rq: RulesRequest) -> VectorDbResponse:
    _config = rq.config()
    db_name = rq.out_name()
    store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
    try:
        await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
        db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
        return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
    except Exception as e:
        try:
            await _cleanup_directory(store_path)
        finally:
            return VectorDbResponse(success = False, error = str(e))
    finally:
        gc.collect()

#@atimer
async def kb(rq: KbRequest) -> VectorDbResponse:
    os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
    _config = rq.config()
    db_name = rq.out_name()
    src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
    working_path = os.path.join(src_path, db_name)

    if all([not rq.files,not rq.endpoints,not rq.integrations]):
        return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
    else:
        await aiofiles.os.makedirs(src_path, exist_ok=True)
        await aiofiles.os.makedirs(working_path, exist_ok=True)

    documents: list[Document] = []
    # Download/copy all files
    if rq.files:
        try:
            loaders = Loader(working_path)
            filter_file_extensions = loaders.managed_file_extensions()
            files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
            if files_to_download:
                await download_files(
                    [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
                    src_path, authorization=_config.robot_cms_auth)
            # copy files to working tmp folder
            for file in rq.files:
                async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
                    async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
                        await dest_file.write(await src_file.read())
            #load files
            try:
                documents.extend(await loaders.load())
            except Exception as e:
                tb = traceback.format_exc()
                _error = f"File loader failure: {e} | {tb}"
                logging.warning(_error)
                return VectorDbResponse(success = False, error = _error)
        except Exception as e:
            await _cleanup_directory(working_path)
            return VectorDbResponse(success = False, error = f"Failed to download file {e}")

    if rq.endpoints:
        try:
            documents.extend(await load_endpoints(rq.endpoints, working_path))
        except Exception as e:
            await _cleanup_directory(working_path)
            tb = traceback.format_exc()
            _error = f"Endpoint failure: {e} | {tb}"
            logging.warning(_error)
            return VectorDbResponse(success = False, error = _error)

    if rq.integrations:
        tasks = []
        for integration in rq.integrations:
            tasks.append(
                IntegrationManager
                .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
                .load()
            )
        try:
            integration_documents = await asyncio.gather(*tasks)
            for docs in integration_documents:
                documents.extend(docs)
        except Exception as e:
            await _cleanup_directory(working_path)
            tb = traceback.format_exc()
            _error = f"Integration failure: {e} | {tb}"
            logging.warning(_error)
            return VectorDbResponse(success=False, error=_error)

    #cleanup
    await _cleanup_directory(working_path)

    if documents and len(documents) > 0:
        try:
            store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
            db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
                os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
                "zip",
                await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
            )
            return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
        except Exception as e:
            await _cleanup_directory(store_path)
            return VectorDbResponse(success = False, error = str(e))
        finally:
            del documents
            gc.collect()
    else:
        _error = "No documents found in the knowledgebase folder"
        logging.warning(_error)
        return VectorDbResponse(success = False, error = _error)

async def kb_stream_file(filename: str):
    file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="File not found")
    def iter_file():
        with open(file_path, mode="rb") as file:
            while chunk := file.read(1024*8):
                yield chunk
    return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
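The generator module drives knowledge-base builds: rules() and kb() write a zipped vector store into the configured out folder, and kb_stream_file() streams that archive back to the caller. A minimal sketch of how the streaming helper could be mounted behind a FastAPI route follows; the route path and app wiring are illustrative assumptions, not the package's actual routing (that lives in other modules such as ws_bom_robot_app/llm/main.py).

# Illustrative sketch only: the route path and app object are assumptions.
from fastapi import FastAPI
from ws_bom_robot_app.llm.vector_store.generator import kb_stream_file

app = FastAPI()

@app.get("/kb/file/{filename}")
async def download_kb_archive(filename: str):
    # kb_stream_file raises HTTPException(404) when the zip is missing, otherwise
    # returns a StreamingResponse that reads the file in 8 KB chunks.
    return await kb_stream_file(filename)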
ws_bom_robot_app/llm/vector_store/integration/confluence.py

@@ -1,9 +1,10 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
+from unstructured_ingest.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceIndexer, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
+from unstructured_ingest.pipeline.pipeline import Pipeline
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
-from typing import Optional, Union
+from typing import List, Optional, Union
 from pydantic import BaseModel, Field, AliasChoices

 class ConfluenceParams(BaseModel):

@@ -16,6 +17,7 @@ class ConfluenceParams(BaseModel):
         password: Confluence password or Cloud API token, if filled, set the access_token to None and vice versa.
         access_token (str): The personal access token for authenticating with Confluence, e.g., 'AT....'
         spaces (list[str]): A list of Confluence spaces to interact with, e.g., ['SPACE1', 'SPACE2'].
+        max_num_of_docs_from_each_space (int): The maximum number of documents to fetch from each space. Defaults to 500, with a maximum limit of 5000.
         extension (list[str], optional): A list of file extensions to filter by. Defaults to None, e.g., ['.pdf', '.docx'].
     """
     url: str

@@ -23,6 +25,7 @@ class ConfluenceParams(BaseModel):
     password: Optional[str] = None
     access_token: Optional[str] = Field(None, validation_alias=AliasChoices("accessToken","access_token"))
     spaces: list[str] = []
+    max_num_of_docs_from_each_space: int = Field(default=500, ge=1, le=5000,validation_alias=AliasChoices("maxNumOfDocsFromEachSpace","max_num_of_docs_from_each_space"))
     extension: list[str] = Field(default=None)
 class Confluence(IntegrationStrategy):
     def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):

@@ -33,7 +36,8 @@ class Confluence(IntegrationStrategy):
         return 'confluence'
     def run(self) -> None:
         indexer_config = ConfluenceIndexerConfig(
-            spaces=self.__data.spaces
+            spaces=self.__data.spaces,
+            max_num_of_docs_from_each_space=self.__data.max_num_of_docs_from_each_space
         )
         downloader_config = ConfluenceDownloaderConfig(
             download_dir=self.working_directory

@@ -43,13 +47,37 @@ class Confluence(IntegrationStrategy):
             url=self.__data.url,
             username=self.__data.username
         )
-        self.__unstructured_ingest.pipeline(
+        pipeline: Pipeline = self.__unstructured_ingest.pipeline(
             indexer_config,
             downloader_config,
             connection_config,
-            extension=self.__data.extension)
+            extension=self.__data.extension
+        )
+        pipeline.indexer_step.process = CustomConfluenceIndexer(**vars(pipeline.indexer_step.process))
+        pipeline.run()
     async def load(self) -> list[Document]:
         await asyncio.to_thread(self.run)
         await asyncio.sleep(1)
         return await Loader(self.working_directory).load()

+class CustomConfluenceIndexer(ConfluenceIndexer):
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            try:
+                setattr(super(), key, value)
+            except AttributeError:
+                setattr(self, key, value)
+    def _get_docs_ids_within_one_space(self, space_key: str) -> List[dict]:
+        with self.connection_config.get_client() as client:
+            pages = client.get_all_pages_from_space(
+                space=space_key,
+                start=0,
+                limit=self.index_config.max_num_of_docs_from_each_space, #explicitly limit the number of pages fetched (omitted in unstructured-ingest)
+                expand=None,
+                content_type="page", # blogpost and comment types not currently supported
+                status=None,
+            )
+            limited_pages = pages[: self.index_config.max_num_of_docs_from_each_space]
+            doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]
+            return doc_ids
+
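The Confluence integration now caps how many pages are fetched per space: ConfluenceParams gains max_num_of_docs_from_each_space (default 500, at most 5000), the value is passed to ConfluenceIndexerConfig, and CustomConfluenceIndexer enforces the limit when listing page ids, since the stock unstructured-ingest indexer omits it. A minimal validation sketch, not taken from the package, with placeholder values:

# Minimal sketch: illustrative values only; both alias spellings are accepted.
from ws_bom_robot_app.llm.vector_store.integration.confluence import ConfluenceParams

params = ConfluenceParams.model_validate({
    "url": "https://example.atlassian.net",
    "username": "bot@example.com",
    "accessToken": "AT....",                 # alias of access_token
    "spaces": ["SPACE1", "SPACE2"],
    "maxNumOfDocsFromEachSpace": 250,        # alias of max_num_of_docs_from_each_space
})
assert params.max_num_of_docs_from_each_space == 250  # values outside 1-5000 fail validation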
ws_bom_robot_app/llm/vector_store/integration/shopify.py

@@ -1,143 +1,143 @@
(every line of the file is removed and re-added with identical content; the 143 lines are shown once, with the Italian comments translated)

import asyncio, logging, aiohttp
from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
from langchain_core.documents import Document
from ws_bom_robot_app.llm.vector_store.loader.base import Loader
from typing import List, Union, Optional
from pydantic import BaseModel, Field, AliasChoices, field_validator
import json
import os

class ShopifyParams(BaseModel):
    """
    ShopifyParams is a model that defines the parameters required for Shopify integration.

    Attributes:
        shop_name (str): The shop name for Shopify.
        access_token (str): The access token for Shopify.
        graphql_query (Union[str, dict]): The GraphQL query string or dict for Shopify.
    """
    shop_name: str = Field(validation_alias=AliasChoices("shopName","shop_name"))
    access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
    graphql_query: Union[str, dict] = Field(validation_alias=AliasChoices("graphqlQuery","graphql_query"))

    @field_validator('graphql_query')
    @classmethod
    def extract_query_string(cls, v):
        """Extract the query string from dict format if needed"""
        if isinstance(v, dict) and 'query' in v:
            return v['query']
        return v

class Shopify(IntegrationStrategy):
    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
        super().__init__(knowledgebase_path, data)
        self.__data = ShopifyParams.model_validate(self.data)

    def working_subdirectory(self) -> str:
        return 'shopify'

    async def run(self) -> None:
        _data = await self.__get_data()
        json_file_path = os.path.join(self.working_directory, 'shopify_data.json')
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(_data, f, ensure_ascii=False)

    async def load(self) -> list[Document]:
        await self.run()
        await asyncio.sleep(1)
        return await Loader(self.working_directory).load()

    async def __get_data(self, page_size: int = 50) -> List[dict]:
        # API URL
        url = f"https://{self.__data.shop_name}.myshopify.com/admin/api/2024-07/graphql.json"

        # Headers
        headers = {
            "X-Shopify-Access-Token": self.__data.access_token,
            "Content-Type": "application/json"
        }

        all_products: List[dict] = []
        has_next_page = True
        cursor = None
        retry_count = 0
        max_retries = 5

        while has_next_page:
            # Variables for the query
            variables = {
                "first": page_size
            }

            if cursor:
                variables["after"] = cursor

            # Request payload
            payload = {
                "query": self.__data.graphql_query,
                "variables": variables
            }

            try:
                # Make the request
                async with aiohttp.ClientSession() as session:
                    async with session.post(url, headers=headers, json=payload) as response:
                        # Check whether the response is JSON
                        try:
                            data = await response.json()
                        except aiohttp.ContentTypeError:
                            text = await response.text()
                            logging.error(f"Non-JSON response received. Status code: {response.status}")
                            logging.error(f"Content: {text}")
                            raise Exception("Invalid response from API")

                        # Throttling handling
                        if "errors" in data:
                            error = data["errors"][0]
                            if error.get("extensions", {}).get("code") == "THROTTLED":
                                retry_count += 1
                                if retry_count > max_retries:
                                    raise Exception("Too many throttling attempts. Stopping execution.")

                                # Wait a bit longer on each attempt
                                wait_time = 2 ** retry_count  # exponential backoff
                                print(f"Rate limit reached. Waiting {wait_time} seconds... (Attempt {retry_count}/{max_retries})")
                                await asyncio.sleep(wait_time)
                                continue
                            else:
                                raise Exception(f"GraphQL errors: {data['errors']}")

                        # Reset the retry counter if the request succeeded
                        retry_count = 0

                        # Extract the data
                        products_data = data["data"]["products"]
                        edges = products_data["edges"]
                        page_info = products_data["pageInfo"]

                        # Add the products to the list
                        for edge in edges:
                            all_products.append(edge["node"])

                        # Update the cursor and the pagination flag
                        has_next_page = page_info["hasNextPage"]
                        cursor = page_info["endCursor"]

                        print(f"Retrieved {len(edges)} products. Total: {len(all_products)}")

                        # Small pause to avoid saturating the API
                        await asyncio.sleep(0.1)

            except aiohttp.ClientError as e:
                logging.error(f"Connection error: {e}")
                retry_count += 1
                if retry_count <= max_retries:
                    wait_time = 2 ** retry_count
                    logging.warning(f"Retrying in {wait_time} seconds...")
                    await asyncio.sleep(wait_time)
                    continue
                else:
                    raise Exception("Too many network errors. Stopping execution.")

        logging.info(f"Data retrieval completed! Total products: {len(all_products)}")
        return all_products
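ShopifyParams accepts the GraphQL query either as a raw string or as a {"query": ...} dict; the field_validator collapses the dict form to the query string before __get_data pages through products.edges with the first/after variables and the pageInfo cursor shown above. A minimal sketch with sample values, not taken from the package:

# Minimal sketch: shop name, token and query are placeholders; the query must expose
# products.edges and pageInfo, since the pagination loop above reads those fields.
from ws_bom_robot_app.llm.vector_store.integration.shopify import ShopifyParams

query = """
query getProducts($first: Int!, $after: String) {
  products(first: $first, after: $after) {
    edges { node { id title } }
    pageInfo { hasNextPage endCursor }
  }
}
"""
params = ShopifyParams.model_validate({
    "shopName": "my-store",            # alias of shop_name
    "accessToken": "shpat_xxx",        # alias of access_token
    "graphqlQuery": {"query": query},  # dict form, normalized by extract_query_string
})
assert isinstance(params.graphql_query, str)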
ws_bom_robot_app/llm/vector_store/integration/sitemap.py

@@ -33,6 +33,7 @@ class Sitemap(IntegrationStrategy):
         self.__exclude_class: list[str] = self.data.get("excludeClass",[]) # type: ignore
         self.__exclude_id: list[str] = self.data.get("excludeId",[]) # type: ignore
         self.__restrict_to_same_domain: bool = self.data.get("restrictDomain", True) # type: ignore
+        self.__header_template = self.data.get("headers", None)
     def working_subdirectory(self) -> str:
         return ""
     def _extract(self, tag: Tag) -> str:

@@ -81,7 +82,8 @@ class Sitemap(IntegrationStrategy):
             filter_urls=self.__filter_urls,
             parsing_function=self._parse,
             is_local=self._is_local(self.__sitemap_url),
-            restrict_to_same_domain=self.__restrict_to_same_domain
+            restrict_to_same_domain=self.__restrict_to_same_domain,
+            header_template=self.__header_template
         )
         _docs = self._output([document async for document in self.alazy_load(_loader)])
         if self._is_local(self.__sitemap_url):