ws-bom-robot-app 0.0.25__py3-none-any.whl → 0.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,7 +53,6 @@ class Settings(BaseSettings):
53
53
  pass
54
54
  return 1
55
55
  debug: bool
56
- loader_strategy: str
57
56
  loader_show_progress: bool
58
57
  loader_silent_errors: bool
59
58
  number_of_workers: int = _get_number_of_workers()
@@ -68,11 +67,11 @@ class Settings(BaseSettings):
68
67
  the loader options are useful for minimizing system requirements/dependencies for local development
69
68
  """
70
69
  if self.robot_env == "local":
71
- return self.RuntimeOptions(debug=True,loader_strategy="auto",loader_show_progress=True, loader_silent_errors=True)
70
+ return self.RuntimeOptions(debug=True,loader_show_progress=True, loader_silent_errors=True)
72
71
  elif self.robot_env == "development":
73
- return self.RuntimeOptions(debug=True,loader_strategy="",loader_show_progress=True, loader_silent_errors=False)
72
+ return self.RuntimeOptions(debug=True,loader_show_progress=True, loader_silent_errors=False)
74
73
  else:
75
- return self.RuntimeOptions(debug=False,loader_strategy="",loader_show_progress=False, loader_silent_errors=True)
74
+ return self.RuntimeOptions(debug=False,loader_show_progress=False, loader_silent_errors=True)
76
75
 
77
76
  # global instance
78
77
  config = Settings()
@@ -0,0 +1,15 @@
1
+ from langchain_core.documents import Document
2
+ from langchain_text_splitters import CharacterTextSplitter
3
+
4
+ class DocumentChunker:
5
+ @staticmethod
6
+ def chunk(documents: list[Document]) -> list[Document]:
7
+ text_splitter = CharacterTextSplitter(chunk_size=10_000, chunk_overlap=500)
8
+ chunked_documents = []
9
+ for doc in documents:
10
+ chunks = text_splitter.split_text(doc.page_content)
11
+ for chunk in chunks:
12
+ chunked_documents.append(
13
+ Document(page_content=chunk, metadata=doc.metadata)
14
+ )
15
+ return chunked_documents
@@ -4,17 +4,24 @@ from langchain_core.vectorstores.base import VectorStoreRetriever
4
4
  from langchain_openai import OpenAIEmbeddings
5
5
  from typing import Any
6
6
  import asyncio, gc, logging
7
+ from langchain_text_splitters import CharacterTextSplitter
7
8
  from pydantic import SecretStr
8
9
 
10
+ from ws_bom_robot_app.llm.utils.chunker import DocumentChunker
11
+
9
12
  class FaissHelper():
10
13
  _embedding_model = "text-embedding-3-small"
11
14
  _CACHE: dict[str, FAISS] = {}
15
+
12
16
  @staticmethod
13
17
  #@timer
14
18
  async def create(documents: list[Document], folder_path: str, api_key: SecretStr, return_folder_path:bool = False) -> str | None:
15
19
  try:
16
20
  embeddings = OpenAIEmbeddings(api_key=api_key, model=FaissHelper._embedding_model)
17
- faiss_instance = await asyncio.to_thread(FAISS.from_documents, documents, embeddings)
21
+ faiss_instance = await asyncio.to_thread(
22
+ FAISS.from_documents,
23
+ DocumentChunker.chunk(documents),
24
+ embeddings)
18
25
  await asyncio.to_thread(faiss_instance.save_local, folder_path)
19
26
  del faiss_instance, embeddings
20
27
  gc.collect()
@@ -42,16 +42,6 @@ class Loader():
42
42
  '.htm': LoaderConfig(loader=UnstructuredHTMLLoader,kwargs={"strategy":"auto"}),
43
43
  '.html': LoaderConfig(loader=UnstructuredHTMLLoader,kwargs={"strategy":"auto"}),
44
44
  ".pdf": LoaderConfig(loader=DoclingLoader),
45
- #'.pdf': LoaderConfig(loader=UnstructuredLoader,kwargs={
46
- # 'strategy':'ocr_only', #https://docs.unstructured.io/open-source/core-functionality/partitioning auto,ocr_only,hi_res
47
- # 'split_pdf_page': False,
48
- # 'chunking_strategy': 'basic',
49
- # 'max_characters': 10_000,
50
- # 'include_page_breaks': True,
51
- # 'include_orig_elements': False}),
52
- #'.png': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
53
- #'.jpg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
54
- #'.jpeg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
55
45
  '.png': LoaderConfig(loader=DoclingLoader),
56
46
  '.jpg': LoaderConfig(loader=DoclingLoader),
57
47
  '.jpeg': LoaderConfig(loader=DoclingLoader),
@@ -85,8 +75,6 @@ class Loader():
85
75
  loader_configs = {}
86
76
  for ext, loader_config in Loader._list.items():
87
77
  if loader_config:
88
- if all([self._runtime_options.loader_strategy != "",loader_config.kwargs and "strategy" in loader_config.kwargs]): # type: ignore
89
- loader_config.kwargs["strategy"] = self._runtime_options.loader_strategy # type: ignore
90
78
  loader_key = (loader_config.loader, tuple(loader_config.kwargs.items())) # type: ignore
91
79
  if loader_key not in loader_configs:
92
80
  loader_configs[loader_key] = {
@@ -27,9 +27,11 @@ class DoclingLoader(BaseLoader):
27
27
  def lazy_load(self) -> Iterator[Document]:
28
28
  for source in self._file_paths:
29
29
  try:
30
- _result = self._converter.convert(os.path.abspath(source),raises_on_error=True)
30
+ _result = self._converter.convert(
31
+ os.path.abspath(source),
32
+ raises_on_error=True)
31
33
  doc = _result.document
32
- text = doc.export_to_markdown()
34
+ text = doc.export_to_markdown(image_placeholder="")
33
35
  yield Document(page_content=text, metadata={"source": source})
34
36
  except Exception as e:
35
37
  logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.25
3
+ Version: 0.0.26
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -37,7 +37,7 @@ Requires-Dist: unstructured-ingest[sharepoint]
37
37
  Requires-Dist: unstructured-ingest[slack]
38
38
  Requires-Dist: html5lib==1.1
39
39
  Requires-Dist: markdownify==0.14.1
40
- Requires-Dist: nebuly==0.3.33
40
+ Requires-Dist: nebuly==0.3.35
41
41
 
42
42
  # 🤖 ws-bom-robot-app
43
43
 
@@ -228,7 +228,7 @@ dockerize app from src
228
228
 
229
229
  ```pwsh
230
230
  docker build -f Dockerfile-src -t ws-bom-robot-app:src .
231
- docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -v "$(pwd)/.data:/app/.data" -p 6001:6001 ws-bom-robot-app:src
231
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -v "$(pwd)/.data:/app/.data" -v "$(pwd)/tests:/app/tests" -p 6001:6001 ws-bom-robot-app:src
232
232
  ```
233
233
 
234
234
  ### ✈️ publish
@@ -1,6 +1,6 @@
1
1
  ws_bom_robot_app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  ws_bom_robot_app/auth.py,sha256=84nIbmJsMrNs0sxIQGEHbjsjc2P6ZrZZGSn8dkiL6is,895
3
- ws_bom_robot_app/config.py,sha256=V5ZrX_JnzpsN32hTTezTfOvEZYkIQBy0lxIQ9JFHdFE,3170
3
+ ws_bom_robot_app/config.py,sha256=7ZTi5GblQFexYB8zt9oAsNmXh193Yz1vMQYq5A65DcI,3079
4
4
  ws_bom_robot_app/cron_manager.py,sha256=0Yt5AMTPGlXZ_M5ck0SKMX8wvzoPsseEezg_s0Q3HKY,9224
5
5
  ws_bom_robot_app/main.py,sha256=vChP8vfmOCbs51TPUsaaxX8FvoFXuURMkOgmgx0Xi_4,6121
6
6
  ws_bom_robot_app/task_manager.py,sha256=Zedzs2R3O-wNSQOqs4jorgFwPRi-ji_0TN4mGfk-VvE,15958
@@ -25,8 +25,9 @@ ws_bom_robot_app/llm/tools/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
25
25
  ws_bom_robot_app/llm/tools/models/main.py,sha256=LsOJ7vkcSzYLoE1oa3TG0Rs0pr9J5VS_e4li6aDx_fw,260
26
26
  ws_bom_robot_app/llm/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
27
  ws_bom_robot_app/llm/utils/agent_utils.py,sha256=LEfAKQwFrwmIdJL0o54iuGrir9uLcJhkciLL3R6Xqwo,814
28
+ ws_bom_robot_app/llm/utils/chunker.py,sha256=N7570xBYlObneg-fsvDhPAJ-Pv8C8OaYZOBK6q7LmMI,607
28
29
  ws_bom_robot_app/llm/utils/download.py,sha256=iAUxH_NiCpTPtGzhC4hBtxotd2HPFt2MBhttslIxqiI,3194
29
- ws_bom_robot_app/llm/utils/faiss_helper.py,sha256=69juxptz1gidgxVOrqNvJajRl40p5-ugHqyEBDtnSKo,5036
30
+ ws_bom_robot_app/llm/utils/faiss_helper.py,sha256=VikpopCpEzV1lN5JISDabpHcIUkNDACNL52KliB4Hxs,5224
30
31
  ws_bom_robot_app/llm/utils/kb.py,sha256=jja45WCbNI7SGEgqDS99nErlwB5eY8Ga7BMnhdMHZ90,1279
31
32
  ws_bom_robot_app/llm/utils/print.py,sha256=ZonoLPcfM6Cpw4_Ec455LiCovExOwvnIgvw1QORSCBY,799
32
33
  ws_bom_robot_app/llm/utils/webhooks.py,sha256=LAAZqyN6VhV13wu4X-X85TwdDgAV2rNvIwQFIIc0FJM,2114
@@ -48,10 +49,10 @@ ws_bom_robot_app/llm/vector_store/integration/sharepoint.py,sha256=zqqn-6qPrK50P
48
49
  ws_bom_robot_app/llm/vector_store/integration/sitemap.py,sha256=nPbIywp-ZwWbWStvjvYVgHqqejyYFr8eZhBc8ycTuaU,4206
49
50
  ws_bom_robot_app/llm/vector_store/integration/slack.py,sha256=FMjESXm2QetFXI6i8epze7Kbbu22fV8CVaxb71AHnJ8,2572
50
51
  ws_bom_robot_app/llm/vector_store/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
- ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=SWV7T6BcsV8nvnUAHHZ9Q2oFUEnfwM33jpJCry5vbIA,5847
52
- ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=aHHfMf2JsZo0o6jrRDlImY0Oi9NFhVQk8Wg5ePAPa50,1721
52
+ ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=GmicqcmEF965Wnd2ysB4ZR_VOxZ0Ys0aNdubssb3_ZE,4963
53
+ ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=dns_A--Wb-oIGYcbrW5RQ6-ALR5cCmySioo389K0GK0,1775
53
54
  ws_bom_robot_app/llm/vector_store/loader/json_loader.py,sha256=qo9ejRZyKv_k6jnGgXnu1W5uqsMMtgqK_uvPpZQ0p74,833
54
- ws_bom_robot_app-0.0.25.dist-info/METADATA,sha256=TObdL0LhroQrJaqOUTwLEY9gqyk_ct-yDPJzcOWps_w,7478
55
- ws_bom_robot_app-0.0.25.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
56
- ws_bom_robot_app-0.0.25.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
57
- ws_bom_robot_app-0.0.25.dist-info/RECORD,,
55
+ ws_bom_robot_app-0.0.26.dist-info/METADATA,sha256=bu2IDziI3DNb_rLBTWnrRXe7S2QVmmqWzP6NOLbNQFU,7507
56
+ ws_bom_robot_app-0.0.26.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
57
+ ws_bom_robot_app-0.0.26.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
58
+ ws_bom_robot_app-0.0.26.dist-info/RECORD,,