ws-bom-robot-app 0.0.24__tar.gz → 0.0.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/PKG-INFO +17 -7
  2. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/README.md +12 -1
  3. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/setup.py +1 -1
  4. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/config.py +3 -4
  5. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/main.py +1 -0
  6. ws_bom_robot_app-0.0.26/ws_bom_robot_app/llm/utils/chunker.py +15 -0
  7. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/utils/faiss_helper.py +8 -1
  8. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/generator.py +13 -5
  9. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  10. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/manager.py +2 -0
  11. ws_bom_robot_app-0.0.26/ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +106 -0
  12. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/loader/base.py +23 -31
  13. ws_bom_robot_app-0.0.26/ws_bom_robot_app/llm/vector_store/loader/docling.py +37 -0
  14. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/requirements.txt +8 -5
  15. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app.egg-info/PKG-INFO +17 -7
  16. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app.egg-info/SOURCES.txt +2 -0
  17. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app.egg-info/requires.txt +4 -5
  18. ws_bom_robot_app-0.0.24/ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +0 -51
  19. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/pyproject.toml +0 -0
  20. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/setup.cfg +0 -0
  21. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/__init__.py +0 -0
  22. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/auth.py +0 -0
  23. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/cron_manager.py +0 -0
  24. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/__init__.py +0 -0
  25. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/agent_description.py +0 -0
  26. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/agent_handler.py +0 -0
  27. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/agent_lcel.py +0 -0
  28. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/api.py +0 -0
  29. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/defaut_prompt.py +0 -0
  30. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/models/__init__.py +0 -0
  31. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/models/api.py +0 -0
  32. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/models/base.py +0 -0
  33. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/models/kb.py +0 -0
  34. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/settings.py +0 -0
  35. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/tools/__init__.py +0 -0
  36. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/tools/models/__init__.py +0 -0
  37. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/tools/models/main.py +0 -0
  38. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/tools/tool_builder.py +0 -0
  39. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/tools/tool_manager.py +0 -0
  40. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/tools/utils.py +0 -0
  41. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/utils/__init__.py +0 -0
  42. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/utils/agent_utils.py +0 -0
  43. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/utils/download.py +0 -0
  44. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/utils/kb.py +0 -0
  45. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/utils/print.py +0 -0
  46. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/utils/webhooks.py +0 -0
  47. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/__init__.py +0 -0
  48. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/__init__.py +0 -0
  49. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/azure.py +0 -0
  50. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/base.py +0 -0
  51. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/confluence.py +0 -0
  52. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/gcs.py +0 -0
  53. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/github.py +0 -0
  54. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/googledrive.py +0 -0
  55. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/jira.py +0 -0
  56. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/s3.py +0 -0
  57. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/sftp.py +0 -0
  58. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/sitemap.py +0 -0
  59. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/slack.py +0 -0
  60. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/loader/__init__.py +0 -0
  61. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/loader/json_loader.py +0 -0
  62. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/main.py +0 -0
  63. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/task_manager.py +0 -0
  64. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/util.py +0 -0
  65. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app.egg-info/dependency_links.txt +0 -0
  66. {ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app.egg-info/top_level.txt +0 -0
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ws_bom_robot_app
- Version: 0.0.24
+ Version: 0.0.26
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
  Home-page: https://github.com/websolutespa/bom
  Author: Websolute Spa
@@ -21,10 +21,8 @@ Requires-Dist: langchain-openai==0.2.10
  Requires-Dist: langchain-community==0.3.8
  Requires-Dist: langchain-core==0.3.21
  Requires-Dist: faiss-cpu==1.9.0
- Requires-Dist: python-magic==0.4.27
- Requires-Dist: opencv-python-headless==4.10.0.84
- Requires-Dist: unstructured[all-docs]==0.16.11
- Requires-Dist: langchain_unstructured==0.1.5
+ Requires-Dist: unstructured==0.16.11
+ Requires-Dist: unstructured[image]
  Requires-Dist: unstructured-ingest==0.3.8
  Requires-Dist: unstructured-ingest[azure]
  Requires-Dist: unstructured-ingest[confluence]
@@ -35,10 +33,11 @@ Requires-Dist: unstructured-ingest[google_drive]
  Requires-Dist: unstructured-ingest[jira]
  Requires-Dist: unstructured-ingest[s3]
  Requires-Dist: unstructured-ingest[sftp]
+ Requires-Dist: unstructured-ingest[sharepoint]
  Requires-Dist: unstructured-ingest[slack]
  Requires-Dist: html5lib==1.1
  Requires-Dist: markdownify==0.14.1
- Requires-Dist: nebuly==0.3.33
+ Requires-Dist: nebuly==0.3.35

  # 🤖 ws-bom-robot-app

@@ -214,11 +213,22 @@ launch debugger
  streamlit run debugger.py --server.port 6002
  ```

+ dockerize base image
+
+ ```pwsh
+ <# cpu #>
+ docker build -f Dockerfile-robot-base-cpu -t ghcr.io/websolutespa/ws-bom-robot-base:cpu .
+ docker push ghcr.io/websolutespa/ws-bom-robot-base:cpu
+ <# gpu #>
+ docker build -f Dockerfile-robot-base-gpu -t ghcr.io/websolutespa/ws-bom-robot-base:gpu .
+ docker push ghcr.io/websolutespa/ws-bom-robot-base:gpu
+ ```
+
  dockerize app from src

  ```pwsh
  docker build -f Dockerfile-src -t ws-bom-robot-app:src .
- docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -v "$(pwd)/.data:/app/.data" -v "$(pwd)/tests:/app/tests" -p 6001:6001 ws-bom-robot-app:src
  ```

  ### ✈️ publish
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/README.md

@@ -172,11 +172,22 @@ launch debugger
  streamlit run debugger.py --server.port 6002
  ```

+ dockerize base image
+
+ ```pwsh
+ <# cpu #>
+ docker build -f Dockerfile-robot-base-cpu -t ghcr.io/websolutespa/ws-bom-robot-base:cpu .
+ docker push ghcr.io/websolutespa/ws-bom-robot-base:cpu
+ <# gpu #>
+ docker build -f Dockerfile-robot-base-gpu -t ghcr.io/websolutespa/ws-bom-robot-base:gpu .
+ docker push ghcr.io/websolutespa/ws-bom-robot-base:gpu
+ ```
+
  dockerize app from src

  ```pwsh
  docker build -f Dockerfile-src -t ws-bom-robot-app:src .
- docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -v "$(pwd)/.data:/app/.data" -v "$(pwd)/tests:/app/tests" -p 6001:6001 ws-bom-robot-app:src
  ```

  ### ✈️ publish
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

  setup(
  name="ws_bom_robot_app",
- version="0.0.24",
+ version="0.0.26",
  description="A FastAPI application serving ws bom/robot/llm platform ai.",
  long_description=open("README.md", encoding='utf-8').read(),
  long_description_content_type="text/markdown",
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/config.py

@@ -53,7 +53,6 @@ class Settings(BaseSettings):
  pass
  return 1
  debug: bool
- loader_strategy: str
  loader_show_progress: bool
  loader_silent_errors: bool
  number_of_workers: int = _get_number_of_workers()
@@ -68,11 +67,11 @@ class Settings(BaseSettings):
  the loader options is usefull to minimizing sytem requirements/dependencies for local development
  """
  if self.robot_env == "local":
- return self.RuntimeOptions(debug=True,loader_strategy="auto",loader_show_progress=True, loader_silent_errors=True)
+ return self.RuntimeOptions(debug=True,loader_show_progress=True, loader_silent_errors=True)
  elif self.robot_env == "development":
- return self.RuntimeOptions(debug=True,loader_strategy="",loader_show_progress=True, loader_silent_errors=False)
+ return self.RuntimeOptions(debug=True,loader_show_progress=True, loader_silent_errors=False)
  else:
- return self.RuntimeOptions(debug=False,loader_strategy="",loader_show_progress=False, loader_silent_errors=True)
+ return self.RuntimeOptions(debug=False,loader_show_progress=False, loader_silent_errors=True)

  # global instance
  config = Settings()
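
The change drops the now-unused `loader_strategy` field from the per-environment runtime options. A minimal standalone sketch of the pattern, with simplified names rather than the package's actual `Settings` class:

```python
from pydantic import BaseModel

class RuntimeOptions(BaseModel):
    debug: bool
    loader_show_progress: bool
    loader_silent_errors: bool

def runtime_options(robot_env: str) -> RuntimeOptions:
    # local: verbose and fault tolerant; development: verbose but fail fast; anything else: quiet and fault tolerant
    if robot_env == "local":
        return RuntimeOptions(debug=True, loader_show_progress=True, loader_silent_errors=True)
    if robot_env == "development":
        return RuntimeOptions(debug=True, loader_show_progress=True, loader_silent_errors=False)
    return RuntimeOptions(debug=False, loader_show_progress=False, loader_silent_errors=True)

print(runtime_options("local"))
```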
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/main.py

@@ -25,6 +25,7 @@ async def invoke(rq: InvokeRequest) -> str:

  async def __stream(rq: StreamRequest,queue: Queue,formatted: bool = True) -> None:
  await rq.initialize()
+ #os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
  if formatted:
  agent_handler = AgentHandler(queue,rq.thread_id)
  else:
ws_bom_robot_app-0.0.26/ws_bom_robot_app/llm/utils/chunker.py

@@ -0,0 +1,15 @@
+ from langchain_core.documents import Document
+ from langchain_text_splitters import CharacterTextSplitter
+
+ class DocumentChunker:
+ @staticmethod
+ def chunk(documents: list[Document]) -> list[Document]:
+ text_splitter = CharacterTextSplitter(chunk_size=10_000, chunk_overlap=500)
+ chunked_documents = []
+ for doc in documents:
+ chunks = text_splitter.split_text(doc.page_content)
+ for chunk in chunks:
+ chunked_documents.append(
+ Document(page_content=chunk, metadata=doc.metadata)
+ )
+ return chunked_documents
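
For reference, a minimal usage sketch of the new `DocumentChunker` (the input document below is illustrative, not from the package):

```python
from langchain_core.documents import Document
from ws_bom_robot_app.llm.utils.chunker import DocumentChunker

# paragraphs are merged into chunks of up to ~10,000 characters with a 500-character overlap;
# each chunk keeps the metadata of the document it came from
long_text = "\n\n".join(["lorem ipsum dolor sit amet " * 100] * 20)
docs = [Document(page_content=long_text, metadata={"source": "example.txt"})]
chunks = DocumentChunker.chunk(docs)
print(len(chunks), chunks[0].metadata)
```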
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/utils/faiss_helper.py

@@ -4,17 +4,24 @@ from langchain_core.vectorstores.base import VectorStoreRetriever
  from langchain_openai import OpenAIEmbeddings
  from typing import Any
  import asyncio, gc, logging
+ from langchain_text_splitters import CharacterTextSplitter
  from pydantic import SecretStr

+ from ws_bom_robot_app.llm.utils.chunker import DocumentChunker
+
  class FaissHelper():
  _embedding_model = "text-embedding-3-small"
  _CACHE: dict[str, FAISS] = {}
+
  @staticmethod
  #@timer
  async def create(documents: list[Document], folder_path: str, api_key: SecretStr, return_folder_path:bool = False) -> str | None:
  try:
  embeddings = OpenAIEmbeddings(api_key=api_key, model=FaissHelper._embedding_model)
- faiss_instance = await asyncio.to_thread(FAISS.from_documents, documents, embeddings)
+ faiss_instance = await asyncio.to_thread(
+ FAISS.from_documents,
+ DocumentChunker.chunk(documents),
+ embeddings)
  await asyncio.to_thread(faiss_instance.save_local, folder_path)
  del faiss_instance, embeddings
  gc.collect()
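
A hedged usage sketch of the updated `FaissHelper.create` (placeholder API key and documents; the signature is taken from the hunk above, and chunking now happens inside `create` via `DocumentChunker`):

```python
import asyncio
from pydantic import SecretStr
from langchain_core.documents import Document
from ws_bom_robot_app.llm.utils.faiss_helper import FaissHelper

async def build_index():
    docs = [Document(page_content="some knowledge base text", metadata={"source": "kb.txt"})]
    # documents are chunked, embedded with text-embedding-3-small, and the FAISS index is saved locally
    await FaissHelper.create(docs, folder_path=".data/faiss_index", api_key=SecretStr("sk-<placeholder>"))

asyncio.run(build_index())
```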
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/generator.py

@@ -1,4 +1,4 @@
- import os, gc, shutil, traceback
+ import os, gc, shutil, logging, traceback
  import asyncio, aiofiles, aiofiles.os
  from fastapi import HTTPException
  from fastapi.responses import StreamingResponse
@@ -67,7 +67,9 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
  documents.extend(await loaders.load())
  except Exception as e:
  tb = traceback.format_exc()
- return VectorDbResponse(success = False, error = f"File loader failure: {e} | {tb}")
+ _error = f"File loader failure: {e} | {tb}"
+ logging.warning(_error)
+ return VectorDbResponse(success = False, error = _error)
  except Exception as e:
  await _cleanup_directory(working_path)
  return VectorDbResponse(success = False, error = f"Failed to download file {e}")
@@ -78,7 +80,9 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
  except Exception as e:
  await _cleanup_directory(working_path)
  tb = traceback.format_exc()
- return VectorDbResponse(success = False, error = f"Endpoint failure: {e} | {tb}")
+ _error = f"Endpoint failure: {e} | {tb}"
+ logging.warning(_error)
+ return VectorDbResponse(success = False, error = _error)

  if rq.integrations:
  tasks = []
@@ -95,7 +99,9 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
  except Exception as e:
  await _cleanup_directory(working_path)
  tb = traceback.format_exc()
- return VectorDbResponse(success=False, error=f"Integration failure: {e} | {tb}")
+ _error = f"Integration failure: {e} | {tb}"
+ logging.warning(_error)
+ return VectorDbResponse(success=False, error=_error)

  #cleanup
  await _cleanup_directory(working_path)
@@ -116,7 +122,9 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
  del documents
  gc.collect()
  else:
- return VectorDbResponse(success = False, error = "No documents found in the knowledgebase folder")
+ _error = "No documents found in the knowledgebase folder"
+ logging.warning(_error)
+ return VectorDbResponse(success = False, error = _error)

  async def kb_stream_file(filename: str):
  file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
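
All four hunks apply the same pattern: build the error string once, log it as a warning, then return it in the response. A minimal standalone sketch of that pattern, with `VectorDbResponse` reduced to a placeholder model:

```python
import logging, traceback
from pydantic import BaseModel

class VectorDbResponse(BaseModel):
    # simplified stand-in for the package's response model
    success: bool
    error: str | None = None

def load_documents() -> VectorDbResponse:
    try:
        raise RuntimeError("loader exploded")  # simulate a loader failure
    except Exception as e:
        tb = traceback.format_exc()
        # the error is logged (visible server-side) and also returned to the caller
        _error = f"File loader failure: {e} | {tb}"
        logging.warning(_error)
        return VectorDbResponse(success=False, error=_error)

print(load_documents().success)
```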
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/dropbox.py

@@ -12,7 +12,7 @@ class DropboxParams(BaseModel):
  Attributes:
  remote_url (str): The URL of the remote Dropbox location, e.g. 'dropbox://demo-directory' or 'dropbox://demo-directory/sub-directory'.
  token (str): The authentication token for accessing Dropbox.
- create app: https://www.dropbox.com/developers, with file.content.read permission, and generate token.
+ create app: https://www.dropbox.com/developers, with file.content.read permission, and generate token, or use existing app: https://www.dropbox.com/account/connected_apps / https://www.dropbox.com/developers/apps?_tk=pilot_lp&_ad=topbar4&_camp=myapps
  recursive (bool, optional): A flag indicating whether to search directories recursively. Defaults to False.
  extension (list[str], optional): A list of file extensions to filter by. Defaults to None, e.g. ['.pdf', '.docx'].
  """
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/integration/manager.py

@@ -9,6 +9,7 @@ from ws_bom_robot_app.llm.vector_store.integration.googledrive import GoogleDriv
  from ws_bom_robot_app.llm.vector_store.integration.jira import Jira
  from ws_bom_robot_app.llm.vector_store.integration.s3 import S3
  from ws_bom_robot_app.llm.vector_store.integration.sftp import Sftp
+ from ws_bom_robot_app.llm.vector_store.integration.sharepoint import Sharepoint
  from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap
  from ws_bom_robot_app.llm.vector_store.integration.slack import Slack

@@ -23,6 +24,7 @@ class IntegrationManager:
  "llmkbjira": Jira,
  "llmkbs3": S3,
  "llmkbsftp": Sftp,
+ "llmkbsharepoint": Sharepoint,
  "llmkbsitemap": Sitemap,
  "llmkbslack": Slack,

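The manager appears to be a plain key-to-class registry; a simplified dispatch sketch (the `_get_strategy` helper and the trimmed registry below are illustrative, not the package's actual code):

```python
from ws_bom_robot_app.llm.vector_store.integration.sharepoint import Sharepoint
from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap

# trimmed registry mirroring the keys added in the hunk above
_REGISTRY: dict[str, type] = {
    "llmkbsharepoint": Sharepoint,
    "llmkbsitemap": Sitemap,
}

def _get_strategy(kind: str, knowledgebase_path: str, data: dict):
    # fail fast on unknown integration keys instead of deep inside the pipeline
    cls = _REGISTRY.get(kind)
    if cls is None:
        raise ValueError(f"unsupported integration: {kind}")
    return cls(knowledgebase_path, data)
```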
ws_bom_robot_app-0.0.26/ws_bom_robot_app/llm/vector_store/integration/sharepoint.py

@@ -0,0 +1,106 @@
+ import asyncio, logging, traceback
+ from dataclasses import dataclass
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+ from unstructured_ingest.v2.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointIndexer, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
+ from langchain_core.documents import Document
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+ from typing import Union, Optional
+ from pydantic import BaseModel, Field, AliasChoices
+
+ class SharepointParams(BaseModel):
+ """
+ SharepointParams is a Pydantic model that defines the parameters required to connect to a SharePoint site.
+
+ Attributes:
+ client_id (str): The client ID for SharePoint authentication.
+ client_secret (str): The client secret for SharePoint authentication.
+ site_url (str): The URL of the SharePoint site. i.e. site collection level: https://<tenant>.sharepoint.com/sites/<site-collection-name>, or root site: https://<tenant>.sharepoint.com
+ site_path (str, optional): TThe path in the SharePoint site from which to start parsing files, for example "Shared Documents". Defaults to None.
+ recursive (bool, optional): Whether to recursively access subdirectories. Defaults to False.
+ omit_files (bool, optional): Whether to omit files from the results. Defaults to False.
+ omit_pages (bool, optional): Whether to omit pages from the results. Defaults to False.
+ omit_lists (bool, optional): Whether to omit lists from the results. Defaults to False.
+ extension (list[str], optional): A list of file extensions to include, i.e. [".pdf"] Defaults to None.
+ """
+ client_id : str = Field(validation_alias=AliasChoices("clientId","client_id"))
+ client_secret : str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
+ site_url: str = Field(validation_alias=AliasChoices("siteUrl","site_url"))
+ site_path: str = Field(default=None,validation_alias=AliasChoices("sitePath","site_path"))
+ recursive: bool = Field(default=False)
+ omit_files: bool = Field(default=False, validation_alias=AliasChoices("omitFiles","omit_files")),
+ omit_pages: bool = Field(default=False, validation_alias=AliasChoices("omitPages","omit_pages")),
+ omit_lists: bool = Field(default=False, validation_alias=AliasChoices("omitLists","omit_lists")),
+ extension: list[str] = Field(default=None)
+ class Sharepoint(IntegrationStrategy):
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+ super().__init__(knowledgebase_path, data)
+ self.__data = SharepointParams.model_validate(self.data)
+ self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+ def working_subdirectory(self) -> str:
+ return 'sharepoint'
+ def run(self) -> None:
+ indexer_config = SharepointIndexerConfig(
+ path=self.__data.site_path,
+ recursive=self.__data.recursive,
+ omit_files=self.__data.omit_files,
+ omit_pages=self.__data.omit_pages,
+ omit_lists=self.__data.omit_lists
+ )
+ downloader_config = SharepointDownloaderConfig(
+ download_dir=self.working_directory
+ )
+ connection_config = SharepointConnectionConfig(
+ access_config=SharepointAccessConfig(client_cred=self.__data.client_secret),
+ client_id=self.__data.client_id,
+ site=self.__data.site_url,
+ permissions_config=None
+ )
+ pipeline = self.__unstructured_ingest.pipeline(
+ indexer_config,
+ downloader_config,
+ connection_config,
+ extension=self.__data.extension)
+ current_indexer_process = pipeline.indexer_step.process
+ pipeline.indexer_step.process = CustomSharepointIndexer(**vars(current_indexer_process))
+ pipeline.run()
+ async def load(self) -> list[Document]:
+ await asyncio.to_thread(self.run)
+ await asyncio.sleep(1)
+ return await Loader(self.working_directory).load()
+
+ @dataclass
+ class CustomSharepointIndexer(SharepointIndexer):
+ def __init__(self, **kwargs):
+ # Initialize all attributes from the base indexer
+ for key, value in kwargs.items():
+ setattr(self, key, value)
+ def list_files(self, folder, recursive):
+ try:
+ _files = super().list_files(folder, recursive)
+ return _files
+ except Exception as e:
+ tb = traceback.format_exc()
+ logging.error(f"Error listing sharepoint files: {e} \n {tb}")
+ return []
+ def file_to_file_data(self, client, file):
+ try:
+ return super().file_to_file_data(client, file)
+ except Exception as e:
+ tb = traceback.format_exc()
+ logging.error(f"Error converting sharepoint file {file} to data: {e} \n {tb}")
+ return None
+ def list_pages(self, client):
+ try:
+ _pages = super().list_pages(client)
+ _allowed_content_type = None
+ for page in _pages:
+ # determine the allowed content type from the first page (Home.aspx)
+ if not _allowed_content_type:
+ _allowed_content_type = page.content_type_id
+ if not page.content_type_id == _allowed_content_type:
+ _pages.remove_child(page)
+ return _pages
+ except Exception as e:
+ tb = traceback.format_exc()
+ logging.error(f"Error listing sharepoint pages: {e} \n {tb}")
+ return []
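
Based on `SharepointParams` above, a hedged configuration-and-load sketch (credentials, site URL, and paths are placeholders; the camelCase keys are accepted through the `AliasChoices` validation aliases):

```python
import asyncio
from ws_bom_robot_app.llm.vector_store.integration.sharepoint import Sharepoint

# illustrative payload: values must come from an Azure AD app registration with access to the site
data = {
    "clientId": "<azure-ad-app-client-id>",
    "clientSecret": "<azure-ad-app-client-secret>",
    "siteUrl": "https://<tenant>.sharepoint.com/sites/<site-collection-name>",
    "sitePath": "Shared Documents",
    "recursive": True,
    "extension": [".pdf", ".docx"],
}

async def main():
    integration = Sharepoint(knowledgebase_path=".data/kb", data=data)
    # runs the unstructured-ingest pipeline in a worker thread, then parses the downloads with Loader
    documents = await integration.load()
    print(len(documents))

asyncio.run(main())
```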
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/llm/vector_store/loader/base.py

@@ -1,21 +1,18 @@
-
  import asyncio, gc, logging, os, traceback
  from typing import Any, Optional
  from langchain_community.document_loaders import DirectoryLoader
  from langchain_community.document_loaders.base import BaseLoader
  from langchain_community.document_loaders.merge import MergedDataLoader
  from langchain_core.documents import Document
- from langchain_unstructured import UnstructuredLoader
  from pydantic import BaseModel
  from ws_bom_robot_app.config import config
  from ws_bom_robot_app.llm.vector_store.loader.json_loader import JsonLoader
+ from ws_bom_robot_app.llm.vector_store.loader.docling import DoclingLoader
+ from langchain_community.document_loaders import CSVLoader, UnstructuredPowerPointLoader, UnstructuredEmailLoader, UnstructuredXMLLoader, TextLoader, UnstructuredHTMLLoader

  class LoaderConfig(BaseModel):
  loader: type[BaseLoader]
- kwargs: Optional[dict[str, Any]] = {
- 'chunking_strategy': 'basic',
- 'max_characters': 10_000
- }
+ kwargs: Optional[dict[str, Any]] = {}
  #post_processors: Optional[list[Callable[[str], str]]] = None

  class Loader():
@@ -25,39 +22,36 @@ class Loader():

  _list: dict[str, LoaderConfig | None] = {
  '.json': LoaderConfig(loader=JsonLoader),
- '.csv': LoaderConfig(loader=UnstructuredLoader),
- '.xls': LoaderConfig(loader=UnstructuredLoader),
- '.xlsx': LoaderConfig(loader=UnstructuredLoader),
- '.eml': LoaderConfig(loader=UnstructuredLoader),
- '.msg': LoaderConfig(loader=UnstructuredLoader),
+ '.csv': LoaderConfig(loader=CSVLoader),
+ '.xls': None,
+ '.xlsx': LoaderConfig(loader=DoclingLoader),
+ '.eml': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
+ '.msg': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
  '.epub': None,
- '.md': LoaderConfig(loader=UnstructuredLoader),
+ '.md': LoaderConfig(loader=TextLoader),
  '.org': None,
  '.odt': None,
  '.ppt': None,
- '.pptx': LoaderConfig(loader=UnstructuredLoader),
- '.txt': LoaderConfig(loader=UnstructuredLoader),
+ '.pptx': LoaderConfig(loader=UnstructuredPowerPointLoader,kwargs={"strategy":"auto"}), #docling issue with WMF https://github.com/DS4SD/docling/issues/594
+ '.txt': LoaderConfig(loader=TextLoader),
  '.rst': None,
  '.rtf': None,
  '.tsv': None,
  '.text': None,
  '.log': None,
- '.htm': LoaderConfig(loader=UnstructuredLoader),
- '.html': LoaderConfig(loader=UnstructuredLoader),
- '.pdf': LoaderConfig(loader=UnstructuredLoader,kwargs={
- 'strategy':'ocr_only', #https://docs.unstructured.io/open-source/core-functionality/partitioning auto,ocr_only,hi_res
- 'split_pdf_page': False,
- 'chunking_strategy': 'basic',
- 'max_characters': 10_000,
- 'include_page_breaks': True,
- 'include_orig_elements': False}),
- '.png': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
- '.jpg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
- '.jpeg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
+ '.htm': LoaderConfig(loader=UnstructuredHTMLLoader,kwargs={"strategy":"auto"}),
+ '.html': LoaderConfig(loader=UnstructuredHTMLLoader,kwargs={"strategy":"auto"}),
+ ".pdf": LoaderConfig(loader=DoclingLoader),
+ '.png': LoaderConfig(loader=DoclingLoader),
+ '.jpg': LoaderConfig(loader=DoclingLoader),
+ '.jpeg': LoaderConfig(loader=DoclingLoader),
+ '.gif': None,
+ ".emf": None,
+ ".wmf": None,
  '.tiff': None,
  '.doc': None, #see liberoffice dependency
- '.docx': LoaderConfig(loader=UnstructuredLoader),
- '.xml': LoaderConfig(loader=UnstructuredLoader),
+ '.docx': LoaderConfig(loader=DoclingLoader),
+ '.xml': LoaderConfig(loader=UnstructuredXMLLoader,kwargs={"strategy":"auto"}),
  '.js': None,
  '.py': None,
  '.c': None,
@@ -81,8 +75,6 @@ class Loader():
  loader_configs = {}
  for ext, loader_config in Loader._list.items():
  if loader_config:
- if all([self._runtime_options.loader_strategy != "",loader_config.kwargs,"strategy" in loader_config.kwargs]): # type: ignore
- loader_config.kwargs["strategy"] = self._runtime_options.loader_strategy # type: ignore
  loader_key = (loader_config.loader, tuple(loader_config.kwargs.items())) # type: ignore
  if loader_key not in loader_configs:
  loader_configs[loader_key] = {
@@ -122,7 +114,7 @@ class Loader():
  return _documents
  except Exception as e:
  logging.warning(f"Attempt {attempt+1} load document failed: {e}")
- await asyncio.sleep(1)
+ await asyncio.sleep(2)
  if attempt == MAX_RETRIES - 1:
  tb = traceback.format_exc()
  logging.error(f"Failed to load documents: {e} | {tb}")
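
The `_list` mapping above is an extension-to-loader dispatch table. A simplified standalone sketch of the idea (the `_load_file` helper and the reduced table are illustrative, not the package's actual code):

```python
from typing import Any, Optional
from pydantic import BaseModel
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders import TextLoader, CSVLoader

class LoaderConfig(BaseModel):
    loader: type[BaseLoader]
    kwargs: Optional[dict[str, Any]] = {}

# reduced dispatch table: extension -> loader class + kwargs; None means "skip this extension"
_list: dict[str, LoaderConfig | None] = {
    ".txt": LoaderConfig(loader=TextLoader),
    ".csv": LoaderConfig(loader=CSVLoader),
    ".log": None,
}

def _load_file(path: str) -> list:
    ext = "." + path.rsplit(".", 1)[-1].lower()
    cfg = _list.get(ext)
    if cfg is None:
        return []  # unsupported or intentionally skipped extension
    return cfg.loader(path, **(cfg.kwargs or {})).load()

with open("notes.txt", "w") as fh:
    fh.write("hello loader")
print(len(_load_file("notes.txt")))
```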
ws_bom_robot_app-0.0.26/ws_bom_robot_app/llm/vector_store/loader/docling.py

@@ -0,0 +1,37 @@
+ import os, logging, traceback
+ from typing import Iterator, AsyncIterator, Optional
+ from langchain_core.document_loaders import BaseLoader
+ from langchain_core.documents import Document
+ from langchain_core.runnables import run_in_executor
+ from docling.document_converter import DocumentConverter, ConversionResult, ConversionStatus
+
+ class DoclingLoader(BaseLoader):
+ def __init__(self, file_path: str | list[str]) -> None:
+ self._file_paths = file_path if isinstance(file_path, list) else [file_path]
+ self._converter = DocumentConverter()
+ def load(self) -> list[Document]:
+ """Load data into Document objects."""
+ return list(self.lazy_load())
+ async def aload(self) -> list[Document]:
+ """Load data into Document objects."""
+ return [document async for document in self.alazy_load()]
+ async def alazy_load(self) -> AsyncIterator[Document]:
+ """A lazy loader for Documents."""
+ iterator = await run_in_executor(None, self.lazy_load)
+ done = object()
+ while True:
+ doc = await run_in_executor(None, next, iterator, done) # type: ignore[call-arg, arg-type]
+ if doc is done:
+ break
+ yield doc # type: ignore[misc]
+ def lazy_load(self) -> Iterator[Document]:
+ for source in self._file_paths:
+ try:
+ _result = self._converter.convert(
+ os.path.abspath(source),
+ raises_on_error=True)
+ doc = _result.document
+ text = doc.export_to_markdown(image_placeholder="")
+ yield Document(page_content=text, metadata={"source": source})
+ except Exception as e:
+ logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
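
A short usage sketch for the new `DoclingLoader` (paths are placeholders; each converted file becomes one markdown-text `Document`, and files that fail conversion are logged and skipped):

```python
from ws_bom_robot_app.llm.vector_store.loader.docling import DoclingLoader

# accepts a single path or a list of paths
loader = DoclingLoader(["/path/to/report.pdf", "/path/to/scan.png"])
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))
```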
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app/requirements.txt

@@ -18,10 +18,12 @@ faiss-cpu==1.9.0
  #langchain_chroma==0.1.4

  #loaders
- python-magic==0.4.27
- opencv-python-headless==4.10.0.84 #docker specs
- unstructured[all-docs]==0.16.11
- langchain_unstructured==0.1.5
+ #python-magic==0.4.27
+ #opencv-python-headless==4.10.0.84 #docker specs
+ unstructured==0.16.11
+ unstructured[image]
+ #unstructured[all-docs]==0.16.11
+ #langchain_unstructured==0.1.5
  unstructured-ingest==0.3.8
  unstructured-ingest[azure]
  unstructured-ingest[confluence]
@@ -32,6 +34,7 @@ unstructured-ingest[google_drive]
  unstructured-ingest[jira]
  unstructured-ingest[s3]
  unstructured-ingest[sftp]
+ unstructured-ingest[sharepoint]
  unstructured-ingest[slack]
  html5lib==1.1 #beautifulsoup4 parser

@@ -39,4 +42,4 @@ html5lib==1.1 #beautifulsoup4 parser
  markdownify==0.14.1 #sitemap

  #telemetry
- nebuly==0.3.33
+ nebuly==0.3.35
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ws_bom_robot_app
- Version: 0.0.24
+ Version: 0.0.26
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
  Home-page: https://github.com/websolutespa/bom
  Author: Websolute Spa
@@ -21,10 +21,8 @@ Requires-Dist: langchain-openai==0.2.10
  Requires-Dist: langchain-community==0.3.8
  Requires-Dist: langchain-core==0.3.21
  Requires-Dist: faiss-cpu==1.9.0
- Requires-Dist: python-magic==0.4.27
- Requires-Dist: opencv-python-headless==4.10.0.84
- Requires-Dist: unstructured[all-docs]==0.16.11
- Requires-Dist: langchain_unstructured==0.1.5
+ Requires-Dist: unstructured==0.16.11
+ Requires-Dist: unstructured[image]
  Requires-Dist: unstructured-ingest==0.3.8
  Requires-Dist: unstructured-ingest[azure]
  Requires-Dist: unstructured-ingest[confluence]
@@ -35,10 +33,11 @@ Requires-Dist: unstructured-ingest[google_drive]
  Requires-Dist: unstructured-ingest[jira]
  Requires-Dist: unstructured-ingest[s3]
  Requires-Dist: unstructured-ingest[sftp]
+ Requires-Dist: unstructured-ingest[sharepoint]
  Requires-Dist: unstructured-ingest[slack]
  Requires-Dist: html5lib==1.1
  Requires-Dist: markdownify==0.14.1
- Requires-Dist: nebuly==0.3.33
+ Requires-Dist: nebuly==0.3.35

  # 🤖 ws-bom-robot-app

@@ -214,11 +213,22 @@ launch debugger
  streamlit run debugger.py --server.port 6002
  ```

+ dockerize base image
+
+ ```pwsh
+ <# cpu #>
+ docker build -f Dockerfile-robot-base-cpu -t ghcr.io/websolutespa/ws-bom-robot-base:cpu .
+ docker push ghcr.io/websolutespa/ws-bom-robot-base:cpu
+ <# gpu #>
+ docker build -f Dockerfile-robot-base-gpu -t ghcr.io/websolutespa/ws-bom-robot-base:gpu .
+ docker push ghcr.io/websolutespa/ws-bom-robot-base:gpu
+ ```
+
  dockerize app from src

  ```pwsh
  docker build -f Dockerfile-src -t ws-bom-robot-app:src .
- docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -v "$(pwd)/.data:/app/.data" -v "$(pwd)/tests:/app/tests" -p 6001:6001 ws-bom-robot-app:src
  ```

  ### ✈️ publish
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app.egg-info/SOURCES.txt

@@ -34,6 +34,7 @@ ws_bom_robot_app/llm/tools/models/__init__.py
  ws_bom_robot_app/llm/tools/models/main.py
  ws_bom_robot_app/llm/utils/__init__.py
  ws_bom_robot_app/llm/utils/agent_utils.py
+ ws_bom_robot_app/llm/utils/chunker.py
  ws_bom_robot_app/llm/utils/download.py
  ws_bom_robot_app/llm/utils/faiss_helper.py
  ws_bom_robot_app/llm/utils/kb.py
@@ -58,4 +59,5 @@ ws_bom_robot_app/llm/vector_store/integration/sitemap.py
  ws_bom_robot_app/llm/vector_store/integration/slack.py
  ws_bom_robot_app/llm/vector_store/loader/__init__.py
  ws_bom_robot_app/llm/vector_store/loader/base.py
+ ws_bom_robot_app/llm/vector_store/loader/docling.py
  ws_bom_robot_app/llm/vector_store/loader/json_loader.py
{ws_bom_robot_app-0.0.24 → ws_bom_robot_app-0.0.26}/ws_bom_robot_app.egg-info/requires.txt

@@ -9,10 +9,8 @@ langchain-openai==0.2.10
  langchain-community==0.3.8
  langchain-core==0.3.21
  faiss-cpu==1.9.0
- python-magic==0.4.27
- opencv-python-headless==4.10.0.84
- unstructured[all-docs]==0.16.11
- langchain_unstructured==0.1.5
+ unstructured==0.16.11
+ unstructured[image]
  unstructured-ingest==0.3.8
  unstructured-ingest[azure]
  unstructured-ingest[confluence]
@@ -23,7 +21,8 @@ unstructured-ingest[google_drive]
  unstructured-ingest[jira]
  unstructured-ingest[s3]
  unstructured-ingest[sftp]
+ unstructured-ingest[sharepoint]
  unstructured-ingest[slack]
  html5lib==1.1
  markdownify==0.14.1
- nebuly==0.3.33
+ nebuly==0.3.35
ws_bom_robot_app-0.0.24/ws_bom_robot_app/llm/vector_store/integration/sharepoint.py

@@ -1,51 +0,0 @@
- import asyncio
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
- from langchain_core.documents import Document
- from ws_bom_robot_app.llm.vector_store.loader.base import Loader
- from typing import Union, Optional
- from pydantic import BaseModel, Field, AliasChoices
-
- class SharepointParams(BaseModel):
- client_id : str = Field(validation_alias=AliasChoices("clientId","client_id"))
- client_secret : str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
- site_url: str = Field(validation_alias=AliasChoices("siteUrl","site_url"))
- site_path: str = Field(default=None,validation_alias=AliasChoices("sitePath","site_path"))
- recursive: bool = Field(default=False)
- omit_files: bool = Field(default=False, validation_alias=AliasChoices("omitFiles","omit_files")),
- omit_pages: bool = Field(default=False, validation_alias=AliasChoices("omitPages","omit_pages")),
- omit_lists: bool = Field(default=False, validation_alias=AliasChoices("omitLists","omit_lists")),
- extension: list[str] = Field(default=None)
- class Sharepoint(IntegrationStrategy):
- def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
- super().__init__(knowledgebase_path, data)
- self.__data = SharepointParams.model_validate(self.data)
- self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
- def working_subdirectory(self) -> str:
- return 'sharepoint'
- def run(self) -> None:
- indexer_config = SharepointIndexerConfig(
- path=self.__data.site_path,
- recursive=self.__data.recursive,
- omit_files=self.__data.omit_files,
- omit_pages=self.__data.omit_pages,
- omit_lists=self.__data.omit_lists
- )
- downloader_config = SharepointDownloaderConfig(
- download_dir=self.working_directory
- )
- connection_config = SharepointConnectionConfig(
- access_config=SharepointAccessConfig(client_cred=self.__data.client_secret),
- client_id=self.__data.client_id,
- site=self.__data.site_url,
- permissions_config=None
- )
- self.__unstructured_ingest.pipeline(
- indexer_config,
- downloader_config,
- connection_config,
- extension=self.__data.extension).run()
- async def load(self) -> list[Document]:
- await asyncio.to_thread(self.run)
- await asyncio.sleep(1)
- return await Loader(self.working_directory).load()