ws-bom-robot-app 0.0.25__py3-none-any.whl → 0.0.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ws_bom_robot_app/config.py +3 -4
- ws_bom_robot_app/llm/utils/chunker.py +15 -0
- ws_bom_robot_app/llm/utils/faiss_helper.py +8 -1
- ws_bom_robot_app/llm/vector_store/loader/base.py +0 -12
- ws_bom_robot_app/llm/vector_store/loader/docling.py +4 -2
- {ws_bom_robot_app-0.0.25.dist-info → ws_bom_robot_app-0.0.26.dist-info}/METADATA +3 -3
- {ws_bom_robot_app-0.0.25.dist-info → ws_bom_robot_app-0.0.26.dist-info}/RECORD +9 -8
- {ws_bom_robot_app-0.0.25.dist-info → ws_bom_robot_app-0.0.26.dist-info}/WHEEL +0 -0
- {ws_bom_robot_app-0.0.25.dist-info → ws_bom_robot_app-0.0.26.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/config.py
CHANGED
|
@@ -53,7 +53,6 @@ class Settings(BaseSettings):
|
|
|
53
53
|
pass
|
|
54
54
|
return 1
|
|
55
55
|
debug: bool
|
|
56
|
-
loader_strategy: str
|
|
57
56
|
loader_show_progress: bool
|
|
58
57
|
loader_silent_errors: bool
|
|
59
58
|
number_of_workers: int = _get_number_of_workers()
|
|
@@ -68,11 +67,11 @@ class Settings(BaseSettings):
|
|
|
68
67
|
the loader options are useful for minimizing system requirements/dependencies for local development
|
|
69
68
|
"""
|
|
70
69
|
if self.robot_env == "local":
|
|
71
|
-
return self.RuntimeOptions(debug=True,
|
|
70
|
+
return self.RuntimeOptions(debug=True,loader_show_progress=True, loader_silent_errors=True)
|
|
72
71
|
elif self.robot_env == "development":
|
|
73
|
-
return self.RuntimeOptions(debug=True,
|
|
72
|
+
return self.RuntimeOptions(debug=True,loader_show_progress=True, loader_silent_errors=False)
|
|
74
73
|
else:
|
|
75
|
-
return self.RuntimeOptions(debug=False,
|
|
74
|
+
return self.RuntimeOptions(debug=False,loader_show_progress=False, loader_silent_errors=True)
|
|
76
75
|
|
|
77
76
|
# global instance
|
|
78
77
|
config = Settings()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from langchain_core.documents import Document
|
|
2
|
+
from langchain_text_splitters import CharacterTextSplitter
|
|
3
|
+
|
|
4
|
+
class DocumentChunker:
    """Split documents into overlapping character-based chunks."""

    @staticmethod
    def chunk(documents: list[Document]) -> list[Document]:
        """Return the chunks of every document in ``documents``, in order.

        Each input document's ``page_content`` is split with a
        ``CharacterTextSplitter`` (``chunk_size=10_000``,
        ``chunk_overlap=500``) and every resulting piece becomes a new
        ``Document``.

        Args:
            documents: Documents whose ``page_content`` should be split.

        Returns:
            A flat list with one ``Document`` per chunk. Each chunk carries
            its own copy of the source document's metadata.
        """
        # NOTE(review): CharacterTextSplitter splits on its configured
        # separator only; text that never contains that separator may produce
        # chunks longer than chunk_size -- confirm this is acceptable.
        text_splitter = CharacterTextSplitter(chunk_size=10_000, chunk_overlap=500)
        # split_documents() performs the same per-document split_text() loop
        # as before, but copies the metadata for each chunk. Previously every
        # chunk shared the very same ``doc.metadata`` dict, so mutating one
        # chunk's metadata silently changed all sibling chunks and the source
        # document as well.
        return text_splitter.split_documents(documents)
|
|
@@ -4,17 +4,24 @@ from langchain_core.vectorstores.base import VectorStoreRetriever
|
|
|
4
4
|
from langchain_openai import OpenAIEmbeddings
|
|
5
5
|
from typing import Any
|
|
6
6
|
import asyncio, gc, logging
|
|
7
|
+
from langchain_text_splitters import CharacterTextSplitter
|
|
7
8
|
from pydantic import SecretStr
|
|
8
9
|
|
|
10
|
+
from ws_bom_robot_app.llm.utils.chunker import DocumentChunker
|
|
11
|
+
|
|
9
12
|
class FaissHelper():
|
|
10
13
|
_embedding_model = "text-embedding-3-small"
|
|
11
14
|
_CACHE: dict[str, FAISS] = {}
|
|
15
|
+
|
|
12
16
|
@staticmethod
|
|
13
17
|
#@timer
|
|
14
18
|
async def create(documents: list[Document], folder_path: str, api_key: SecretStr, return_folder_path:bool = False) -> str | None:
|
|
15
19
|
try:
|
|
16
20
|
embeddings = OpenAIEmbeddings(api_key=api_key, model=FaissHelper._embedding_model)
|
|
17
|
-
faiss_instance = await asyncio.to_thread(
|
|
21
|
+
faiss_instance = await asyncio.to_thread(
|
|
22
|
+
FAISS.from_documents,
|
|
23
|
+
DocumentChunker.chunk(documents),
|
|
24
|
+
embeddings)
|
|
18
25
|
await asyncio.to_thread(faiss_instance.save_local, folder_path)
|
|
19
26
|
del faiss_instance, embeddings
|
|
20
27
|
gc.collect()
|
|
@@ -42,16 +42,6 @@ class Loader():
|
|
|
42
42
|
'.htm': LoaderConfig(loader=UnstructuredHTMLLoader,kwargs={"strategy":"auto"}),
|
|
43
43
|
'.html': LoaderConfig(loader=UnstructuredHTMLLoader,kwargs={"strategy":"auto"}),
|
|
44
44
|
".pdf": LoaderConfig(loader=DoclingLoader),
|
|
45
|
-
#'.pdf': LoaderConfig(loader=UnstructuredLoader,kwargs={
|
|
46
|
-
# 'strategy':'ocr_only', #https://docs.unstructured.io/open-source/core-functionality/partitioning auto,ocr_only,hi_res
|
|
47
|
-
# 'split_pdf_page': False,
|
|
48
|
-
# 'chunking_strategy': 'basic',
|
|
49
|
-
# 'max_characters': 10_000,
|
|
50
|
-
# 'include_page_breaks': True,
|
|
51
|
-
# 'include_orig_elements': False}),
|
|
52
|
-
#'.png': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
|
|
53
|
-
#'.jpg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
|
|
54
|
-
#'.jpeg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
|
|
55
45
|
'.png': LoaderConfig(loader=DoclingLoader),
|
|
56
46
|
'.jpg': LoaderConfig(loader=DoclingLoader),
|
|
57
47
|
'.jpeg': LoaderConfig(loader=DoclingLoader),
|
|
@@ -85,8 +75,6 @@ class Loader():
|
|
|
85
75
|
loader_configs = {}
|
|
86
76
|
for ext, loader_config in Loader._list.items():
|
|
87
77
|
if loader_config:
|
|
88
|
-
if all([self._runtime_options.loader_strategy != "",loader_config.kwargs and "strategy" in loader_config.kwargs]): # type: ignore
|
|
89
|
-
loader_config.kwargs["strategy"] = self._runtime_options.loader_strategy # type: ignore
|
|
90
78
|
loader_key = (loader_config.loader, tuple(loader_config.kwargs.items())) # type: ignore
|
|
91
79
|
if loader_key not in loader_configs:
|
|
92
80
|
loader_configs[loader_key] = {
|
|
@@ -27,9 +27,11 @@ class DoclingLoader(BaseLoader):
|
|
|
27
27
|
def lazy_load(self) -> Iterator[Document]:
|
|
28
28
|
for source in self._file_paths:
|
|
29
29
|
try:
|
|
30
|
-
_result = self._converter.convert(
|
|
30
|
+
_result = self._converter.convert(
|
|
31
|
+
os.path.abspath(source),
|
|
32
|
+
raises_on_error=True)
|
|
31
33
|
doc = _result.document
|
|
32
|
-
text = doc.export_to_markdown()
|
|
34
|
+
text = doc.export_to_markdown(image_placeholder="")
|
|
33
35
|
yield Document(page_content=text, metadata={"source": source})
|
|
34
36
|
except Exception as e:
|
|
35
37
|
logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ws_bom_robot_app
|
|
3
|
-
Version: 0.0.25
|
|
3
|
+
Version: 0.0.26
|
|
4
4
|
Summary: A FastAPI application serving ws bom/robot/llm platform ai.
|
|
5
5
|
Home-page: https://github.com/websolutespa/bom
|
|
6
6
|
Author: Websolute Spa
|
|
@@ -37,7 +37,7 @@ Requires-Dist: unstructured-ingest[sharepoint]
|
|
|
37
37
|
Requires-Dist: unstructured-ingest[slack]
|
|
38
38
|
Requires-Dist: html5lib==1.1
|
|
39
39
|
Requires-Dist: markdownify==0.14.1
|
|
40
|
-
Requires-Dist: nebuly==0.3.
|
|
40
|
+
Requires-Dist: nebuly==0.3.35
|
|
41
41
|
|
|
42
42
|
# 🤖 ws-bom-robot-app
|
|
43
43
|
|
|
@@ -228,7 +228,7 @@ dockerize app from src
|
|
|
228
228
|
|
|
229
229
|
```pwsh
|
|
230
230
|
docker build -f Dockerfile-src -t ws-bom-robot-app:src .
|
|
231
|
-
docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -v "$(pwd)/.data:/app/.data" -p 6001:6001 ws-bom-robot-app:src
|
|
231
|
+
docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -v "$(pwd)/.data:/app/.data" -v "$(pwd)/tests:/app/tests" -p 6001:6001 ws-bom-robot-app:src
|
|
232
232
|
```
|
|
233
233
|
|
|
234
234
|
### ✈️ publish
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
ws_bom_robot_app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
ws_bom_robot_app/auth.py,sha256=84nIbmJsMrNs0sxIQGEHbjsjc2P6ZrZZGSn8dkiL6is,895
|
|
3
|
-
ws_bom_robot_app/config.py,sha256=
|
|
3
|
+
ws_bom_robot_app/config.py,sha256=7ZTi5GblQFexYB8zt9oAsNmXh193Yz1vMQYq5A65DcI,3079
|
|
4
4
|
ws_bom_robot_app/cron_manager.py,sha256=0Yt5AMTPGlXZ_M5ck0SKMX8wvzoPsseEezg_s0Q3HKY,9224
|
|
5
5
|
ws_bom_robot_app/main.py,sha256=vChP8vfmOCbs51TPUsaaxX8FvoFXuURMkOgmgx0Xi_4,6121
|
|
6
6
|
ws_bom_robot_app/task_manager.py,sha256=Zedzs2R3O-wNSQOqs4jorgFwPRi-ji_0TN4mGfk-VvE,15958
|
|
@@ -25,8 +25,9 @@ ws_bom_robot_app/llm/tools/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
|
|
|
25
25
|
ws_bom_robot_app/llm/tools/models/main.py,sha256=LsOJ7vkcSzYLoE1oa3TG0Rs0pr9J5VS_e4li6aDx_fw,260
|
|
26
26
|
ws_bom_robot_app/llm/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
27
|
ws_bom_robot_app/llm/utils/agent_utils.py,sha256=LEfAKQwFrwmIdJL0o54iuGrir9uLcJhkciLL3R6Xqwo,814
|
|
28
|
+
ws_bom_robot_app/llm/utils/chunker.py,sha256=N7570xBYlObneg-fsvDhPAJ-Pv8C8OaYZOBK6q7LmMI,607
|
|
28
29
|
ws_bom_robot_app/llm/utils/download.py,sha256=iAUxH_NiCpTPtGzhC4hBtxotd2HPFt2MBhttslIxqiI,3194
|
|
29
|
-
ws_bom_robot_app/llm/utils/faiss_helper.py,sha256=
|
|
30
|
+
ws_bom_robot_app/llm/utils/faiss_helper.py,sha256=VikpopCpEzV1lN5JISDabpHcIUkNDACNL52KliB4Hxs,5224
|
|
30
31
|
ws_bom_robot_app/llm/utils/kb.py,sha256=jja45WCbNI7SGEgqDS99nErlwB5eY8Ga7BMnhdMHZ90,1279
|
|
31
32
|
ws_bom_robot_app/llm/utils/print.py,sha256=ZonoLPcfM6Cpw4_Ec455LiCovExOwvnIgvw1QORSCBY,799
|
|
32
33
|
ws_bom_robot_app/llm/utils/webhooks.py,sha256=LAAZqyN6VhV13wu4X-X85TwdDgAV2rNvIwQFIIc0FJM,2114
|
|
@@ -48,10 +49,10 @@ ws_bom_robot_app/llm/vector_store/integration/sharepoint.py,sha256=zqqn-6qPrK50P
|
|
|
48
49
|
ws_bom_robot_app/llm/vector_store/integration/sitemap.py,sha256=nPbIywp-ZwWbWStvjvYVgHqqejyYFr8eZhBc8ycTuaU,4206
|
|
49
50
|
ws_bom_robot_app/llm/vector_store/integration/slack.py,sha256=FMjESXm2QetFXI6i8epze7Kbbu22fV8CVaxb71AHnJ8,2572
|
|
50
51
|
ws_bom_robot_app/llm/vector_store/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
|
-
ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=
|
|
52
|
-
ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=
|
|
52
|
+
ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=GmicqcmEF965Wnd2ysB4ZR_VOxZ0Ys0aNdubssb3_ZE,4963
|
|
53
|
+
ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=dns_A--Wb-oIGYcbrW5RQ6-ALR5cCmySioo389K0GK0,1775
|
|
53
54
|
ws_bom_robot_app/llm/vector_store/loader/json_loader.py,sha256=qo9ejRZyKv_k6jnGgXnu1W5uqsMMtgqK_uvPpZQ0p74,833
|
|
54
|
-
ws_bom_robot_app-0.0.
|
|
55
|
-
ws_bom_robot_app-0.0.
|
|
56
|
-
ws_bom_robot_app-0.0.
|
|
57
|
-
ws_bom_robot_app-0.0.
|
|
55
|
+
ws_bom_robot_app-0.0.26.dist-info/METADATA,sha256=bu2IDziI3DNb_rLBTWnrRXe7S2QVmmqWzP6NOLbNQFU,7507
|
|
56
|
+
ws_bom_robot_app-0.0.26.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
57
|
+
ws_bom_robot_app-0.0.26.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
|
|
58
|
+
ws_bom_robot_app-0.0.26.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|