ws-bom-robot-app 0.0.33__py3-none-any.whl → 0.0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. ws_bom_robot_app/config.py +10 -1
  2. ws_bom_robot_app/llm/agent_description.py +123 -124
  3. ws_bom_robot_app/llm/agent_handler.py +180 -167
  4. ws_bom_robot_app/llm/agent_lcel.py +54 -64
  5. ws_bom_robot_app/llm/api.py +33 -21
  6. ws_bom_robot_app/llm/defaut_prompt.py +15 -9
  7. ws_bom_robot_app/llm/main.py +109 -102
  8. ws_bom_robot_app/llm/models/api.py +55 -7
  9. ws_bom_robot_app/llm/models/kb.py +11 -2
  10. ws_bom_robot_app/llm/providers/__init__.py +0 -0
  11. ws_bom_robot_app/llm/providers/llm_manager.py +174 -0
  12. ws_bom_robot_app/llm/settings.py +4 -4
  13. ws_bom_robot_app/llm/tools/models/main.py +5 -3
  14. ws_bom_robot_app/llm/tools/tool_builder.py +23 -19
  15. ws_bom_robot_app/llm/tools/tool_manager.py +133 -101
  16. ws_bom_robot_app/llm/tools/utils.py +25 -25
  17. ws_bom_robot_app/llm/utils/agent_utils.py +17 -16
  18. ws_bom_robot_app/llm/utils/download.py +79 -79
  19. ws_bom_robot_app/llm/utils/print.py +29 -29
  20. ws_bom_robot_app/llm/utils/secrets.py +26 -0
  21. ws_bom_robot_app/llm/vector_store/generator.py +137 -137
  22. ws_bom_robot_app/llm/vector_store/integration/base.py +12 -1
  23. ws_bom_robot_app/llm/vector_store/loader/base.py +6 -5
  24. ws_bom_robot_app/llm/vector_store/loader/docling.py +27 -6
  25. ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
  26. ws_bom_robot_app/main.py +7 -2
  27. {ws_bom_robot_app-0.0.33.dist-info → ws_bom_robot_app-0.0.35.dist-info}/METADATA +25 -12
  28. {ws_bom_robot_app-0.0.33.dist-info → ws_bom_robot_app-0.0.35.dist-info}/RECORD +30 -28
  29. ws_bom_robot_app/llm/utils/faiss_helper.py +0 -127
  30. {ws_bom_robot_app-0.0.33.dist-info → ws_bom_robot_app-0.0.35.dist-info}/WHEEL +0 -0
  31. {ws_bom_robot_app-0.0.33.dist-info → ws_bom_robot_app-0.0.35.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/utils/download.py
@@ -1,79 +1,79 @@
- from typing import List,Optional
- import os, logging, aiohttp, asyncio
- from tqdm.asyncio import tqdm
-
- async def download_files(urls: List[str], destination_folder: str, authorization: str = None):
-     tasks = [download_file(file, os.path.join(destination_folder, os.path.basename(file)), authorization=authorization) for file in urls]
-     results = await asyncio.gather(*tasks, return_exceptions=False)
-     for i, result in enumerate(results):
-         if not result:
-             raise Exception(f"Download failed for file: {urls[i]}")
-
- async def download_file(url: str, destination: str, chunk_size: int = 8192, authorization: str = None) -> Optional[str]:
-     """
-     Downloads a file from a given URL to a destination path asynchronously.
-
-     Args:
-         url: The URL of the file to download
-         destination: The local path where the file should be saved
-         chunk_size: Size of chunks to download (default: 8192 bytes)
-
-     Returns:
-         str: Path to the downloaded file if successful, None otherwise
-
-     Raises:
-         Various exceptions are caught and logged
-     """
-     try:
-         # Ensure the destination directory exists
-         os.makedirs(os.path.dirname(os.path.abspath(destination)), exist_ok=True)
-
-         async with aiohttp.ClientSession() as session:
-             if authorization:
-                 headers = {'Authorization': authorization}
-                 session.headers.update(headers)
-             async with session.get(url) as response:
-                 # Check if the request was successful
-                 if response.status != 200:
-                     logging.error(f"Failed to download file. Status code: {response.status}")
-                     return None
-
-                 # Get the total file size if available
-                 total_size = int(response.headers.get('content-length', 0))
-                 # Open the destination file and write chunks
-                 with open(destination, 'wb') as f:
-                     with tqdm(
-                         total=total_size,
-                         desc="Downloading",
-                         unit='B',
-                         unit_scale=True,
-                         unit_divisor=1024
-                     ) as pbar:
-                         async for chunk in response.content.iter_chunked(chunk_size):
-                             if chunk:
-                                 f.write(chunk)
-                                 pbar.update(len(chunk))
-
-         logging.info(f"File downloaded successfully to {destination}")
-         return destination
-
-     except aiohttp.ClientError as e:
-         logging.error(f"Network error occurred: {str(e)}")
-         return None
-     except asyncio.TimeoutError:
-         logging.error("Download timed out")
-         return None
-     except IOError as e:
-         logging.error(f"IO error occurred: {str(e)}")
-         return None
-     except Exception as e:
-         logging.error(f"Unexpected error occurred: {str(e)}")
-         return None
-     finally:
-         # If download failed and file was partially created, clean it up
-         if os.path.exists(destination) and os.path.getsize(destination) == 0:
-             try:
-                 os.remove(destination)
-                 logging.info(f"Cleaned up incomplete download: {destination}")
-             except OSError:
-                 pass
+ from typing import List,Optional
+ import os, logging, aiohttp, asyncio
+ from tqdm.asyncio import tqdm
+
+ async def download_files(urls: List[str], destination_folder: str, authorization: str = None):
+     tasks = [download_file(file, os.path.join(destination_folder, os.path.basename(file)), authorization=authorization) for file in urls]
+     results = await asyncio.gather(*tasks, return_exceptions=False)
+     for i, result in enumerate(results):
+         if not result:
+             raise Exception(f"Download failed for file: {urls[i]}")
+
+ async def download_file(url: str, destination: str, chunk_size: int = 8192, authorization: str = None) -> Optional[str]:
+     """
+     Downloads a file from a given URL to a destination path asynchronously.
+
+     Args:
+         url: The URL of the file to download
+         destination: The local path where the file should be saved
+         chunk_size: Size of chunks to download (default: 8192 bytes)
+
+     Returns:
+         str: Path to the downloaded file if successful, None otherwise
+
+     Raises:
+         Various exceptions are caught and logged
+     """
+     try:
+         # Ensure the destination directory exists
+         os.makedirs(os.path.dirname(os.path.abspath(destination)), exist_ok=True)
+
+         async with aiohttp.ClientSession() as session:
+             if authorization:
+                 headers = {'Authorization': authorization}
+                 session.headers.update(headers)
+             async with session.get(url) as response:
+                 # Check if the request was successful
+                 if response.status != 200:
+                     logging.error(f"Failed to download file. Status code: {response.status}")
+                     return None
+
+                 # Get the total file size if available
+                 total_size = int(response.headers.get('content-length', 0))
+                 # Open the destination file and write chunks
+                 with open(destination, 'wb') as f:
+                     with tqdm(
+                         total=total_size,
+                         desc="Downloading",
+                         unit='B',
+                         unit_scale=True,
+                         unit_divisor=1024
+                     ) as pbar:
+                         async for chunk in response.content.iter_chunked(chunk_size):
+                             if chunk:
+                                 f.write(chunk)
+                                 pbar.update(len(chunk))
+
+         logging.info(f"File downloaded successfully to {destination}")
+         return destination
+
+     except aiohttp.ClientError as e:
+         logging.error(f"Network error occurred: {str(e)}")
+         return None
+     except asyncio.TimeoutError:
+         logging.error("Download timed out")
+         return None
+     except IOError as e:
+         logging.error(f"IO error occurred: {str(e)}")
+         return None
+     except Exception as e:
+         logging.error(f"Unexpected error occurred: {str(e)}")
+         return None
+     finally:
+         # If download failed and file was partially created, clean it up
+         if os.path.exists(destination) and os.path.getsize(destination) == 0:
+             try:
+                 os.remove(destination)
+                 logging.info(f"Cleaned up incomplete download: {destination}")
+             except OSError:
+                 pass
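
download.py is unchanged in content between the two versions (the - and + sides above are identical). For reference, a minimal usage sketch of the helpers; the URLs, folder, and token are hypothetical placeholders:

    import asyncio
    from ws_bom_robot_app.llm.utils.download import download_files

    async def main():
        # each URL is fetched concurrently into ./data; download_files raises
        # if any single download returned None
        await download_files(
            ["https://example.com/kb/manual.pdf", "https://example.com/kb/notes.txt"],
            "./data",
            authorization="Bearer <token>",  # forwarded verbatim as the Authorization header
        )

    asyncio.run(main())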
ws_bom_robot_app/llm/utils/print.py
@@ -1,29 +1,29 @@
- import os, sys, json
-
- class HiddenPrints:
-     def __enter__(self):
-         self._original_stdout = sys.stdout
-         self._original_stderr = sys.stderr
-
-         sys.stdout = open(os.devnull, 'w')
-         sys.stderr = open(os.devnull, 'w')
-
-     def __exit__(self, exc_type, exc_val, exc_tb):
-         sys.stdout.close()
-         sys.stderr.close()
-         sys.stdout = self._original_stdout
-         sys.stderr = self._original_stderr
-
- def printJson(data) -> str:
-     return f"{json.dumps(data, indent=2, sort_keys=True)},"
-
- def printSingleJson(data) -> str:
-     return f"{json.dumps(data, indent=2, sort_keys=True)}"
-
- def printString(data: str) -> str:
-     if data != "":
-         return printJson(data)
-
- def printSingleString(data: str) -> str:
-     if data != "":
-         return printSingleJson(data)
+ import os, sys, json
+
+ class HiddenPrints:
+     def __enter__(self):
+         self._original_stdout = sys.stdout
+         self._original_stderr = sys.stderr
+
+         sys.stdout = open(os.devnull, 'w')
+         sys.stderr = open(os.devnull, 'w')
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         sys.stdout.close()
+         sys.stderr.close()
+         sys.stdout = self._original_stdout
+         sys.stderr = self._original_stderr
+
+ def printJson(data) -> str:
+     return f"{json.dumps(data, indent=2, sort_keys=True)},"
+
+ def printSingleJson(data) -> str:
+     return f"{json.dumps(data, indent=2, sort_keys=True)}"
+
+ def printString(data: str) -> str:
+     if data != "":
+         return printJson(data)
+
+ def printSingleString(data: str) -> str:
+     if data != "":
+         return printSingleJson(data)
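
print.py is likewise unchanged. A short sketch of how the utilities behave (values illustrative):

    from ws_bom_robot_app.llm.utils.print import HiddenPrints, printJson, printSingleJson

    with HiddenPrints():
        print("suppressed")  # stdout/stderr point at os.devnull inside the block

    # printJson appends a trailing comma, presumably for concatenating streamed chunks;
    # printSingleJson emits the bare JSON document
    chunk = printJson({"a": 1})        # '{\n  "a": 1\n},'
    final = printSingleJson({"a": 1})  # '{\n  "a": 1\n}'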
ws_bom_robot_app/llm/utils/secrets.py
@@ -0,0 +1,26 @@
+ import json, os, logging
+ from typing import Union
+ class Secrets:
+     @staticmethod
+     def from_file(path: str) -> dict[str, Union[str, int, list]] | None:
+         if os.path.exists(path):
+             with open(path, 'r') as file:
+                 _content = file.read()
+             try:
+                 return json.loads(_content)
+             except json.JSONDecodeError:
+                 logging.error(f"Failed to parse secret file: {path}")
+         else:
+             logging.error(f"Secret file not found: {path}")
+         return None
+     @staticmethod
+     def from_env(key: str) -> dict[str, Union[str, int, list]] | None:
+         _content = os.getenv(key)
+         if _content:
+             try:
+                 return json.loads(_content)
+             except json.JSONDecodeError:
+                 logging.error(f"Failed to parse environment variable: {key}")
+         else:
+             logging.error(f"Environment variable not found: {key}")
+         return None
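
The new Secrets helper resolves a JSON payload from a file or an environment variable, returning None (and logging an error) on a missing source or a parse failure. A usage sketch with hypothetical names:

    import os
    from ws_bom_robot_app.llm.utils.secrets import Secrets

    # hypothetical environment variable holding a JSON payload
    os.environ["MY_INTEGRATION_SECRET"] = '{"token": "abc123", "spaces": ["docs"]}'

    creds = Secrets.from_env("MY_INTEGRATION_SECRET")    # {'token': 'abc123', 'spaces': ['docs']}
    creds = Secrets.from_file("/run/secrets/robot.json") # dict, or None if missing/invalid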
ws_bom_robot_app/llm/vector_store/generator.py
@@ -1,137 +1,137 @@
- import os, gc, shutil, logging, traceback
- import asyncio, aiofiles, aiofiles.os
- from fastapi import HTTPException
- from fastapi.responses import StreamingResponse
- from langchain_core.documents import Document
- from ws_bom_robot_app.llm.vector_store.loader.base import Loader
- from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
- from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
- from ws_bom_robot_app.llm.utils.faiss_helper import FaissHelper
- from ws_bom_robot_app.config import config
- from ws_bom_robot_app.llm.models.kb import load_endpoints
- from ws_bom_robot_app.llm.utils.download import download_files
-
- async def _cleanup_directory(directory_path: str):
-     if os.path.exists(directory_path):
-         await asyncio.to_thread(shutil.rmtree, directory_path)
-
- #@timer
- async def rules(rq: RulesRequest) -> VectorDbResponse:
-     api_key = rq.api_key()
-     _config = rq.config()
-     db_name = rq.out_name()
-     store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
-     try:
-         await FaissHelper.create([Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path, api_key) #type: ignore
-         db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
-         return VectorDbResponse(file = os.path.basename(db_file_path))
-     except Exception as e:
-         await _cleanup_directory(store_path)
-         return VectorDbResponse(success = False, error = str(e))
-     finally:
-         gc.collect()
-
- #@atimer
- async def kb(rq: KbRequest) -> VectorDbResponse:
-     os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
-     api_key = rq.api_key()
-     _config = rq.config()
-     db_name = rq.out_name()
-     src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
-     working_path = os.path.join(src_path, db_name)
-
-     if all([not rq.files,not rq.endpoints,not rq.integrations]):
-         return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
-     else:
-         await aiofiles.os.makedirs(src_path, exist_ok=True)
-         await aiofiles.os.makedirs(working_path, exist_ok=True)
-
-     documents: list[Document] = []
-     # Download/copy all files
-     if rq.files:
-         try:
-             loaders = Loader(working_path)
-             filter_file_extensions = loaders.managed_file_extensions()
-             files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
-             if files_to_download:
-                 await download_files(
-                     [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
-                     src_path, authorization=_config.robot_cms_auth)
-             # copy files to working tmp folder
-             for file in rq.files:
-                 async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
-                     async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
-                         await dest_file.write(await src_file.read())
-             #load files
-             try:
-                 documents.extend(await loaders.load())
-             except Exception as e:
-                 tb = traceback.format_exc()
-                 _error = f"File loader failure: {e} | {tb}"
-                 logging.warning(_error)
-                 return VectorDbResponse(success = False, error = _error)
-         except Exception as e:
-             await _cleanup_directory(working_path)
-             return VectorDbResponse(success = False, error = f"Failed to download file {e}")
-
-     if rq.endpoints:
-         try:
-             documents.extend(await load_endpoints(rq.endpoints, working_path))
-         except Exception as e:
-             await _cleanup_directory(working_path)
-             tb = traceback.format_exc()
-             _error = f"Endpoint failure: {e} | {tb}"
-             logging.warning(_error)
-             return VectorDbResponse(success = False, error = _error)
-
-     if rq.integrations:
-         tasks = []
-         for integration in rq.integrations:
-             tasks.append(
-                 IntegrationManager
-                 .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
-                 .load()
-             )
-         try:
-             integration_documents = await asyncio.gather(*tasks)
-             for docs in integration_documents:
-                 documents.extend(docs)
-         except Exception as e:
-             await _cleanup_directory(working_path)
-             tb = traceback.format_exc()
-             _error = f"Integration failure: {e} | {tb}"
-             logging.warning(_error)
-             return VectorDbResponse(success=False, error=_error)
-
-     #cleanup
-     await _cleanup_directory(working_path)
-
-     if documents and len(documents) > 0:
-         try:
-             store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
-             db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
-                 os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
-                 "zip",
-                 await FaissHelper.create(documents, store_path, api_key, return_folder_path=True)
-             )
-             return VectorDbResponse(file = os.path.basename(db_file_path))
-         except Exception as e:
-             await _cleanup_directory(store_path)
-             return VectorDbResponse(success = False, error = str(e))
-         finally:
-             del documents
-             gc.collect()
-     else:
-         _error = "No documents found in the knowledgebase folder"
-         logging.warning(_error)
-         return VectorDbResponse(success = False, error = _error)
-
- async def kb_stream_file(filename: str):
-     file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
-     if not os.path.isfile(file_path):
-         raise HTTPException(status_code=404, detail="File not found")
-     def iter_file():
-         with open(file_path, mode="rb") as file:
-             while chunk := file.read(1024*8):
-                 yield chunk
-     return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
+ import os, gc, shutil, logging, traceback
+ import asyncio, aiofiles, aiofiles.os
+ from fastapi import HTTPException
+ from fastapi.responses import StreamingResponse
+ from langchain_core.documents import Document
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+ from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
+ from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
+ from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
+ from ws_bom_robot_app.config import config
+ from ws_bom_robot_app.llm.models.kb import load_endpoints
+ from ws_bom_robot_app.llm.utils.download import download_files
+
+ async def _cleanup_directory(directory_path: str):
+     if os.path.exists(directory_path):
+         await asyncio.to_thread(shutil.rmtree, directory_path)
+
+ #@timer
+ async def rules(rq: RulesRequest) -> VectorDbResponse:
+     _config = rq.config()
+     db_name = rq.out_name()
+     store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
+     try:
+         await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
+         db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
+         return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
+     except Exception as e:
+         try:
+             await _cleanup_directory(store_path)
+         finally:
+             return VectorDbResponse(success = False, error = str(e))
+     finally:
+         gc.collect()
+
+ #@atimer
+ async def kb(rq: KbRequest) -> VectorDbResponse:
+     os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
+     _config = rq.config()
+     db_name = rq.out_name()
+     src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
+     working_path = os.path.join(src_path, db_name)
+
+     if all([not rq.files,not rq.endpoints,not rq.integrations]):
+         return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
+     else:
+         await aiofiles.os.makedirs(src_path, exist_ok=True)
+         await aiofiles.os.makedirs(working_path, exist_ok=True)
+
+     documents: list[Document] = []
+     # Download/copy all files
+     if rq.files:
+         try:
+             loaders = Loader(working_path)
+             filter_file_extensions = loaders.managed_file_extensions()
+             files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
+             if files_to_download:
+                 await download_files(
+                     [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
+                     src_path, authorization=_config.robot_cms_auth)
+             # copy files to working tmp folder
+             for file in rq.files:
+                 async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
+                     async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
+                         await dest_file.write(await src_file.read())
+             #load files
+             try:
+                 documents.extend(await loaders.load())
+             except Exception as e:
+                 tb = traceback.format_exc()
+                 _error = f"File loader failure: {e} | {tb}"
+                 logging.warning(_error)
+                 return VectorDbResponse(success = False, error = _error)
+         except Exception as e:
+             await _cleanup_directory(working_path)
+             return VectorDbResponse(success = False, error = f"Failed to download file {e}")
+
+     if rq.endpoints:
+         try:
+             documents.extend(await load_endpoints(rq.endpoints, working_path))
+         except Exception as e:
+             await _cleanup_directory(working_path)
+             tb = traceback.format_exc()
+             _error = f"Endpoint failure: {e} | {tb}"
+             logging.warning(_error)
+             return VectorDbResponse(success = False, error = _error)
+
+     if rq.integrations:
+         tasks = []
+         for integration in rq.integrations:
+             tasks.append(
+                 IntegrationManager
+                 .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
+                 .load()
+             )
+         try:
+             integration_documents = await asyncio.gather(*tasks)
+             for docs in integration_documents:
+                 documents.extend(docs)
+         except Exception as e:
+             await _cleanup_directory(working_path)
+             tb = traceback.format_exc()
+             _error = f"Integration failure: {e} | {tb}"
+             logging.warning(_error)
+             return VectorDbResponse(success=False, error=_error)
+
+     #cleanup
+     await _cleanup_directory(working_path)
+
+     if documents and len(documents) > 0:
+         try:
+             store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
+             db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
+                 os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
+                 "zip",
+                 await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
+             )
+             return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
+         except Exception as e:
+             await _cleanup_directory(store_path)
+             return VectorDbResponse(success = False, error = str(e))
+         finally:
+             del documents
+             gc.collect()
+     else:
+         _error = "No documents found in the knowledgebase folder"
+         logging.warning(_error)
+         return VectorDbResponse(success = False, error = _error)
+
+ async def kb_stream_file(filename: str):
+     file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
+     if not os.path.isfile(file_path):
+         raise HTTPException(status_code=404, detail="File not found")
+     def iter_file():
+         with open(file_path, mode="rb") as file:
+             while chunk := file.read(1024*8):
+                 yield chunk
+     return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
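
The substantive change in generator.py is the swap from the deleted FaissHelper (faiss_helper.py, removed in this release) to a VectorDbManager strategy selected by rq.vector_type, with the embedding model now supplied by the request via rq.embeddings() instead of a raw api_key. A hedged sketch of the new call shape; the "faiss" strategy key and the manager's internals are assumptions, not shown in this diff:

    # assuming a registered "faiss" strategy and a request exposing embeddings()
    strategy = VectorDbManager.get_strategy("faiss")  # rq.vector_type in the code above
    folder = await strategy.create(
        rq.embeddings(),         # embedding model chosen by the caller
        documents,               # list[Document] gathered from files/endpoints/integrations
        store_path,
        return_folder_path=True  # the returned folder feeds shutil.make_archive
    )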
ws_bom_robot_app/llm/vector_store/integration/base.py
@@ -4,11 +4,22 @@ from abc import ABC, abstractmethod
  from unstructured_ingest.v2.interfaces import ProcessorConfig
  from unstructured_ingest.v2.pipeline.pipeline import Pipeline, PartitionerConfig, FiltererConfig
  from typing import Union
+ from ws_bom_robot_app.llm.utils.secrets import Secrets

  class IntegrationStrategy(ABC):
+     @classmethod
+     def _parse_data(cls, data: dict[str, Union[str, int, list]]) -> dict[str, Union[str, int, list]]:
+         for key, fn in (
+             ("__from_env", Secrets.from_env),
+             ("__from_file", Secrets.from_file),
+         ):
+             if key in data:
+                 if secret := fn(data[key]):
+                     return secret
+         return data
      def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
          self.knowledgebase_path = knowledgebase_path
-         self.data = data
+         self.data = self._parse_data(data)
          self.working_directory = os.path.join(self.knowledgebase_path,self.working_subdirectory())
          os.makedirs(self.working_directory, exist_ok=True)
      @property
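
With _parse_data in place, an integration's configuration dict can be swapped wholesale for a secret resolved at construction time: a __from_env or __from_file key redirects to a JSON payload, and anything else passes through untouched. A sketch with hypothetical values:

    # inline data, used as-is
    data = {"spaceKey": "DOCS", "token": "plain-text"}

    # the whole dict is replaced by the JSON parsed from the CONFLUENCE_SECRET variable
    data = {"__from_env": "CONFLUENCE_SECRET"}

    # same indirection via a mounted secret file
    data = {"__from_file": "/run/secrets/confluence.json"}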
ws_bom_robot_app/llm/vector_store/loader/base.py
@@ -13,6 +13,7 @@ from langchain_community.document_loaders import (
      CSVLoader,
      UnstructuredEmailLoader,
      UnstructuredImageLoader,
+     UnstructuredWordDocumentLoader,
      UnstructuredXMLLoader,
      UnstructuredPowerPointLoader,
      TextLoader
@@ -39,7 +40,7 @@ class Loader():
      '.org': None,
      '.odt': None,
      '.ppt': None,
-     '.pptx': LoaderConfig(loader=UnstructuredPowerPointLoader,kwargs={"strategy":"auto"}), #docling issue with WMF https://github.com/DS4SD/docling/issues/594
+     '.pptx': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredPowerPointLoader, "strategy":"auto"}),
      '.txt': LoaderConfig(loader=TextLoader, kwargs={"autodetect_encoding": True}),
      '.rst': None,
      '.rtf': None,
@@ -49,15 +50,15 @@
      '.htm': LoaderConfig(loader=BSHTMLLoader),
      '.html': LoaderConfig(loader=BSHTMLLoader),
      ".pdf": LoaderConfig(loader=DoclingLoader),
-     '.png': LoaderConfig(loader=UnstructuredImageLoader,kwargs={"strategy":"auto","mode":"single"}),
-     '.jpg': LoaderConfig(loader=UnstructuredImageLoader,kwargs={"strategy":"auto","mode":"single"}),
-     '.jpeg': LoaderConfig(loader=UnstructuredImageLoader,kwargs={"strategy":"auto","mode":"single"}),
+     '.png': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
+     '.jpg': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
+     '.jpeg': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
      '.gif': None,
      ".emf": None,
      ".wmf": None,
      '.tiff': None,
      '.doc': None, #see liberoffice dependency
-     '.docx': LoaderConfig(loader=DoclingLoader),
+     '.docx': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredWordDocumentLoader, "strategy":"auto"}),
      '.xml': LoaderConfig(loader=UnstructuredXMLLoader,kwargs={"strategy":"auto"}),
      '.js': None,
      '.py': None,
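
The loader map now routes .pptx, image, and .docx files through DoclingLoader first, carrying the previous Unstructured loader as a fallback kwarg (the mechanism lives in docling.py below). Assuming LoaderConfig forwards its kwargs to the loader constructor, resolution for a .docx file looks roughly like:

    # hypothetical resolution of the '.docx' entry above
    cfg = LoaderConfig(loader=DoclingLoader, kwargs={"fallback": UnstructuredWordDocumentLoader, "strategy": "auto"})
    loader = cfg.loader("report.docx", **cfg.kwargs)  # DoclingLoader tries first,
    docs = loader.load()                              # the Unstructured loader on failure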
ws_bom_robot_app/llm/vector_store/loader/docling.py
@@ -1,20 +1,27 @@
  import os, logging, traceback
- from typing import Iterator, AsyncIterator, Optional
+ from typing import Any, Iterator, AsyncIterator, Optional, Union
  from langchain_core.document_loaders import BaseLoader
  from langchain_core.documents import Document
  from langchain_core.runnables import run_in_executor
- from docling.document_converter import DocumentConverter, InputFormat, PdfFormatOption
- from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode
+ from docling.document_converter import DocumentConverter, InputFormat, PdfFormatOption, ImageFormatOption
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode, TesseractCliOcrOptions
+ from langchain_community.document_loaders import UnstructuredFileLoader

  class DoclingLoader(BaseLoader):
-     def __init__(self, file_path: str | list[str]) -> None:
+     def __init__(self, file_path: str | list[str], **kwargs: Any) -> None:
          self._file_paths = file_path if isinstance(file_path, list) else [file_path]
          self._converter = DocumentConverter(format_options={
              InputFormat.PDF: PdfFormatOption(
                  pipeline_options=PdfPipelineOptions(
                      table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE)
-                 ))
+                 )),
+             InputFormat.IMAGE: ImageFormatOption(
+                 pipeline_options=PdfPipelineOptions(
+                     ocr_options=TesseractCliOcrOptions(lang=["auto"]),
+                     table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE)
+                 ))
          })
+         self._kwargs = kwargs
      def load(self) -> list[Document]:
          """Load data into Document objects."""
          return list(self.lazy_load())
@@ -40,4 +47,18 @@ class DoclingLoader(BaseLoader):
                  text = doc.export_to_markdown(image_placeholder="")
                  yield Document(page_content=text, metadata={"source": source})
              except Exception as e:
-                 logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
+                 if 'fallback' in self._kwargs:
+                     if issubclass(self._kwargs['fallback'], (BaseLoader, UnstructuredFileLoader)):
+                         logging.info(f"Using fallback loader {self._kwargs['fallback']} for {source}")
+                         try:
+                             loader: Union[BaseLoader, UnstructuredFileLoader] = self._kwargs['fallback'](
+                                 source,
+                                 **{k: v for k, v in self._kwargs.items() if k != 'fallback'}
+                             )
+                             yield from loader.lazy_load()
+                         except Exception as e:
+                             logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
+                     else:
+                         logging.warning(f"Invalid fallback loader {self._kwargs['fallback']}[{type(self._kwargs['fallback'])}] for {source}")
+                 else:
+                     logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
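
A usage sketch of the extended DoclingLoader; the file name is hypothetical. Every kwarg other than fallback (here strategy) is forwarded to the fallback loader's constructor, and a fallback that is not a BaseLoader/UnstructuredFileLoader subclass is logged as invalid and skipped:

    from ws_bom_robot_app.llm.vector_store.loader.docling import DoclingLoader
    from langchain_community.document_loaders import UnstructuredImageLoader

    loader = DoclingLoader("scan.png", fallback=UnstructuredImageLoader, strategy="auto")
    for doc in loader.lazy_load():
        # docling markdown when conversion succeeds, Unstructured output otherwise
        print(doc.metadata["source"], len(doc.page_content))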