ws-bom-robot-app 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ from asyncio import Queue
1
2
  from langchain_core.agents import AgentFinish
2
3
  from langchain_core.outputs import ChatGenerationChunk, GenerationChunk
3
4
  from langchain.callbacks.base import AsyncCallbackHandler
@@ -16,13 +17,13 @@ import json
16
17
 
17
18
  class AgentHandler(AsyncCallbackHandler):
18
19
 
19
- def __init__(self, threadId) -> None:
20
+ def __init__(self, queue: Queue, threadId: str = None) -> None:
20
21
  super().__init__()
21
22
  self._threadId = threadId
22
- self.json_cards = None
23
23
  self.json_block = ""
24
24
  self.is_json_block = False
25
25
  self.backtick_count = 0 # Conteggio dei backticks per il controllo accurato
26
+ self.queue = queue
26
27
 
27
28
  async def on_llm_start(
28
29
  self,
@@ -39,7 +40,7 @@ class AgentHandler(AsyncCallbackHandler):
39
40
  "type": "info",
40
41
  "threadId": self._threadId,
41
42
  }
42
- printJson(firstChunk)
43
+ await self.queue.put(printString(firstChunk))
43
44
 
44
45
  """async def on_chat_model_start(self, serialized: Dict[str, Any], messages: List[List[BaseMessage]], *, run_id: UUID = None, parent_run_id = None, tags = None, metadata = None, **kwargs: Any) -> Any:
45
46
  pass"""
@@ -75,7 +76,7 @@ class AgentHandler(AsyncCallbackHandler):
75
76
  elif self.is_json_block:
76
77
  self.json_block += token
77
78
  else:
78
- printString(token)
79
+ await self.queue.put(printString(token))
79
80
  pass
80
81
 
81
82
  async def on_agent_finish(
@@ -92,12 +93,9 @@ class AgentHandler(AsyncCallbackHandler):
92
93
  AIMessage(content=finish.return_values["output"]),
93
94
  ]
94
95
  )
95
- if self.json_cards:
96
- for card in self.json_cards:
97
- printJson(card)
98
- self.json_cards = None
99
96
  finalChunk = {"type": "end"}
100
- printJson(finalChunk)
97
+ await self.queue.put(printJson(finalChunk))
98
+ await self.queue.put(None)
101
99
 
102
100
  async def process_json_block(self, json_block: str):
103
101
  """Processa il blocco JSON completo."""
@@ -108,15 +106,16 @@ class AgentHandler(AsyncCallbackHandler):
108
106
  try:
109
107
  # Prova a fare il parsing del JSON
110
108
  parsed_json = json.loads(json_block_clean)
111
- printJson(parsed_json)
109
+ await self.queue.put(printJson(parsed_json))
112
110
  except json.JSONDecodeError as e:
113
111
  # Se il JSON è malformato, logga l'errore
114
112
  raise e
115
113
 
116
114
  class RawAgentHandler(AsyncCallbackHandler):
117
115
 
118
- def __init__(self) -> None:
116
+ def __init__(self,queue: Queue) -> None:
119
117
  super().__init__()
118
+ self.queue = queue
120
119
 
121
120
  async def on_llm_start(
122
121
  self,
@@ -147,10 +146,9 @@ class RawAgentHandler(AsyncCallbackHandler):
147
146
  tags: Optional[List[str]] = None,
148
147
  **kwargs: Any,
149
148
  ) -> None:
150
- """Gestisce i nuovi token durante lo streaming."""
151
- if token != "":
152
- print(token)
153
- pass
149
+ """Handles new tokens during streaming."""
150
+ if token: # Only process non-empty tokens
151
+ await self.queue.put(token)
154
152
 
155
153
  async def on_agent_finish(
156
154
  self,
@@ -166,3 +164,4 @@ class RawAgentHandler(AsyncCallbackHandler):
166
164
  AIMessage(content=finish.return_values["output"]),
167
165
  ]
168
166
  )
167
+ await self.queue.put(None)
@@ -25,18 +25,10 @@ async def _invoke(rq: InvokeRequest):
25
25
  async def _stream(rq: StreamRequest) -> StreamingResponse:
26
26
  return StreamingResponse(stream(rq), media_type="application/json")
27
27
 
28
- @router.post("/stream/none")
29
- async def _stream_none(rq: StreamRequest) -> None:
30
- await stream_none(rq)
31
-
32
28
  @router.post("/stream/raw")
33
29
  async def _stream_raw(rq: StreamRequest) -> StreamingResponse:
34
30
  return StreamingResponse(stream(rq, formatted=False), media_type="application/json")
35
31
 
36
- @router.post("/stream/raw/none")
37
- async def _stream_raw_none(rq: StreamRequest) -> None:
38
- await stream_none(rq, formatted=False)
39
-
40
32
  @router.post("/kb")
41
33
  async def _kb(rq: KbRequest) -> VectorDbResponse:
42
34
  return await kb(rq)
@@ -10,6 +10,7 @@ from nebuly.providers.langchain import LangChainTrackingHandler
10
10
  from langchain_core.callbacks.base import AsyncCallbackHandler
11
11
  import warnings, asyncio, os, io, sys, json
12
12
  from typing import List
13
+ from asyncio import Queue
13
14
 
14
15
  async def invoke(rq: InvokeRequest) -> str:
15
16
  await rq.initialize()
@@ -22,12 +23,13 @@ async def invoke(rq: InvokeRequest) -> str:
22
23
  result: AIMessage = await processor.run_agent(_msg)
23
24
  return {"result": result.content}
24
25
 
25
- async def __stream(rq: StreamRequest,formatted: bool = True) -> None:
26
+ async def __stream(rq: StreamRequest,queue: Queue,formatted: bool = True) -> None:
26
27
  await rq.initialize()
28
+ #os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
27
29
  if formatted:
28
- agent_handler = AgentHandler(rq.thread_id)
30
+ agent_handler = AgentHandler(queue,rq.thread_id)
29
31
  else:
30
- agent_handler = RawAgentHandler()
32
+ agent_handler = RawAgentHandler(queue)
31
33
  os.environ["AGENT_HANDLER_FORMATTED"] = str(formatted)
32
34
  callbacks: List[AsyncCallbackHandler] = [agent_handler]
33
35
  settings.init()
@@ -53,7 +55,7 @@ async def __stream(rq: StreamRequest,formatted: bool = True) -> None:
53
55
  processor = AgentLcel(
54
56
  openai_config={"api_key": rq.secrets["openAIApiKey"], "openai_model": rq.model, "temperature": rq.temperature},
55
57
  sys_message=rq.system_message,
56
- tools=get_structured_tools(tools=rq.app_tools, api_key=rq.secrets["openAIApiKey"], callbacks=[callbacks[0]]),
58
+ tools=get_structured_tools(tools=rq.app_tools, api_key=rq.secrets["openAIApiKey"], callbacks=[callbacks[0]], queue=queue),
57
59
  rules=rq.rules
58
60
  )
59
61
 
@@ -71,25 +73,20 @@ async def __stream(rq: StreamRequest,formatted: bool = True) -> None:
71
73
  {"callbacks": callbacks},
72
74
  )
73
75
 
76
+ # Signal the end of streaming
77
+ await queue.put(None)
78
+
74
79
async def stream(rq: StreamRequest, formatted: bool = True) -> AsyncGenerator[str, None]:
    """Run the agent in a background task and yield the chunks it produces.

    A fresh ``Queue`` is created and handed to ``__stream``, which runs as an
    asyncio task. Items are taken from the queue and yielded one by one until
    a ``None`` item is received, which marks the end of the stream.

    Args:
        rq: The stream request forwarded to ``__stream``.
        formatted: Forwarded to ``__stream`` unchanged.

    Yields:
        Each non-``None`` item taken from the queue, in arrival order.

    Raises:
        Whatever the background task raised; it is re-raised when the task is
        awaited on exit.
    """
    queue = Queue()
    task = asyncio.create_task(__stream(rq, queue, formatted))
    # If the producer task ends without ever enqueuing None (for example
    # because it raised), the loop below would block on queue.get() forever
    # and the error would never surface. Enqueue the marker whenever the task
    # finishes; the queue is unbounded, so put_nowait cannot raise QueueFull,
    # and a surplus marker left behind after the loop has exited is harmless.
    task.add_done_callback(lambda _finished: queue.put_nowait(None))
    try:
        while True:
            token = await queue.get()
            if token is None:  # None indicates the end of streaming
                break
            yield token
    finally:
        # Wait for the producer so its remaining work completes and any
        # exception it raised propagates to the caller.
        await task
93
90
 
94
91
async def stream_none(rq: StreamRequest, formatted: bool = True) -> None:
    """Run ``__stream`` to completion without exposing its output.

    Args:
        rq: The stream request forwarded to ``__stream``.
        formatted: Forwarded to ``__stream`` as its ``formatted`` flag.
    """
    # __stream takes the output queue as its second positional parameter, so
    # passing `formatted` there would hand a bool to code that awaits
    # queue.put(...). Supply a throwaway unbounded queue instead; nothing
    # reads from it.
    await __stream(rq, Queue(), formatted)
@@ -1,12 +1,13 @@
1
+ from asyncio import Queue
1
2
  from langchain.tools import StructuredTool
2
3
  from ws_bom_robot_app.llm.models.api import LlmAppTool
3
4
  from ws_bom_robot_app.llm.tools.tool_manager import ToolManager
4
5
 
5
- def get_structured_tools(tools: list[LlmAppTool], api_key:str, callbacks:list) -> list[StructuredTool]:
6
+ def get_structured_tools(tools: list[LlmAppTool], api_key:str, callbacks:list, queue: Queue) -> list[StructuredTool]:
6
7
  _structured_tools :list[StructuredTool] = []
7
8
  for tool in [tool for tool in tools if tool.is_active]:
8
9
  if _tool_config := ToolManager._list.get(tool.function_name):
9
- _tool_instance = ToolManager(tool, api_key, callbacks)
10
+ _tool_instance = ToolManager(tool, api_key, callbacks, queue)
10
11
  _structured_tool = StructuredTool.from_function(
11
12
  coroutine=_tool_instance.get_coroutine(),
12
13
  name=tool.function_id,
@@ -1,3 +1,4 @@
1
+ from asyncio import Queue
1
2
  from typing import Optional, Type, Callable
2
3
  from ws_bom_robot_app.llm.models.api import LlmAppTool
3
4
  from ws_bom_robot_app.llm.utils.faiss_helper import FaissHelper
@@ -33,10 +34,12 @@ class ToolManager:
33
34
  app_tool: LlmAppTool,
34
35
  api_key: str,
35
36
  callbacks: list,
37
+ queue: Optional[Queue] = None
36
38
  ):
37
39
  self.app_tool = app_tool
38
40
  self.api_key = api_key
39
41
  self.callbacks = callbacks
42
+ self.queue = queue
40
43
 
41
44
 
42
45
  #region functions
@@ -64,7 +67,8 @@ class ToolManager:
64
67
  else:
65
68
  search_type = "mixed"
66
69
  search_kwargs = {"k": search_settings.search_k if search_settings.search_k else 4}
67
- getRandomWaitingMessage(self.app_tool.waiting_message, traduction=False)
70
+ if self.queue:
71
+ await self.queue.put(getRandomWaitingMessage(self.app_tool.waiting_message, traduction=False))
68
72
  return await FaissHelper.invoke(self.app_tool.vector_db, self.api_key, query, search_type, search_kwargs)
69
73
  return []
70
74
  #raise ValueError(f"Invalid configuration for {self.settings.name} tool of type {self.settings.type}. Must be a function or vector db not found.")
@@ -3,8 +3,8 @@ from langchain_openai import ChatOpenAI
3
3
  from langchain_core.prompts import PromptTemplate
4
4
  from ws_bom_robot_app.llm.utils.print import printString
5
5
 
6
- def __print_output(data: str) -> None:
7
- printString(data) if os.environ.get("AGENT_HANDLER_FORMATTED") == str(True) else print(f"{data} ")
6
def __print_output(data: str) -> str:
    """Return ``data`` rendered for the currently selected handler mode.

    When the ``AGENT_HANDLER_FORMATTED`` environment variable equals
    ``str(True)`` the result of ``printString(data)`` is returned; otherwise
    ``data`` followed by a single space is returned.
    """
    formatted_mode = os.environ.get("AGENT_HANDLER_FORMATTED") == str(True)
    if formatted_mode:
        return printString(data)
    return f"{data} "
8
8
 
9
9
  def getRandomWaitingMessage(waiting_messages: str, traduction: bool = True) -> str:
10
10
  if not waiting_messages: return ""
@@ -12,13 +12,12 @@ def getRandomWaitingMessage(waiting_messages: str, traduction: bool = True) -> s
12
12
  if not messages: return ""
13
13
  chosen_message = random.choice(messages) + "\n"
14
14
  if not traduction:
15
- __print_output(chosen_message)
15
+ return __print_output(chosen_message)
16
16
  return chosen_message
17
17
 
18
18
  async def translate_text(api_key, language, text: str, callbacks: list) -> str:
19
19
  if language == "it":
20
- __print_output(text)
21
- return
20
+ return __print_output(text)
22
21
  llm = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo-0125", streaming=True)
23
22
  sys_message = """Il tuo compito è di tradurre il testo_da_tradure nella seguente lingua: \n\n lingua: {language}\n\n testo_da_tradure: {testo_da_tradure} \n\nTraduci il testo_da_tradure nella lingua {language} senza aggiungere altro:"""
24
23
  prompt = PromptTemplate.from_template(sys_message)
@@ -14,16 +14,16 @@ class HiddenPrints:
14
14
  sys.stdout = self._original_stdout
15
15
  sys.stderr = self._original_stderr
16
16
 
17
- def printJson(data) -> None:
18
- print(json.dumps(data, indent=2, sort_keys=True), end=",", flush=True)
17
def printJson(data) -> str:
    """Return ``data`` as indented, key-sorted JSON followed by a comma."""
    serialized = json.dumps(data, indent=2, sort_keys=True)
    return serialized + ","
19
19
 
20
20
def printSingleJson(data) -> str:
    """Return ``data`` as indented, key-sorted JSON with no trailing separator."""
    serialized = json.dumps(data, indent=2, sort_keys=True)
    return serialized
22
22
 
23
- def printString(data: str) -> None:
23
def printString(data: str) -> str:
    """Return ``data`` formatted by ``printJson``, or ``""`` for empty input.

    Args:
        data: The value to format.

    Returns:
        ``printJson(data)`` when ``data`` is not the empty string, otherwise
        the empty string.
    """
    # Always return a str, as the annotation promises. Falling through would
    # return None, which the streaming queues these results are put on treat
    # as the end-of-stream marker.
    if data == "":
        return ""
    return printJson(data)
26
26
 
27
- def printSingleString(data: str) -> None:
27
def printSingleString(data: str) -> str:
    """Return ``data`` formatted by ``printSingleJson``, or ``""`` for empty input.

    Args:
        data: The value to format.

    Returns:
        ``printSingleJson(data)`` when ``data`` is not the empty string,
        otherwise the empty string.
    """
    # Always return a str, as the annotation promises, instead of falling
    # through to an implicit None for empty input.
    if data == "":
        return ""
    return printSingleJson(data)
@@ -1,4 +1,4 @@
1
- import os, gc, shutil, traceback
1
+ import os, gc, shutil, logging, traceback
2
2
  import asyncio, aiofiles, aiofiles.os
3
3
  from fastapi import HTTPException
4
4
  from fastapi.responses import StreamingResponse
@@ -67,7 +67,9 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
67
67
  documents.extend(await loaders.load())
68
68
  except Exception as e:
69
69
  tb = traceback.format_exc()
70
- return VectorDbResponse(success = False, error = f"File loader failure: {e} | {tb}")
70
+ _error = f"File loader failure: {e} | {tb}"
71
+ logging.warning(_error)
72
+ return VectorDbResponse(success = False, error = _error)
71
73
  except Exception as e:
72
74
  await _cleanup_directory(working_path)
73
75
  return VectorDbResponse(success = False, error = f"Failed to download file {e}")
@@ -78,7 +80,9 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
78
80
  except Exception as e:
79
81
  await _cleanup_directory(working_path)
80
82
  tb = traceback.format_exc()
81
- return VectorDbResponse(success = False, error = f"Endpoint failure: {e} | {tb}")
83
+ _error = f"Endpoint failure: {e} | {tb}"
84
+ logging.warning(_error)
85
+ return VectorDbResponse(success = False, error = _error)
82
86
 
83
87
  if rq.integrations:
84
88
  tasks = []
@@ -95,7 +99,9 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
95
99
  except Exception as e:
96
100
  await _cleanup_directory(working_path)
97
101
  tb = traceback.format_exc()
98
- return VectorDbResponse(success=False, error=f"Integration failure: {e} | {tb}")
102
+ _error = f"Integration failure: {e} | {tb}"
103
+ logging.warning(_error)
104
+ return VectorDbResponse(success=False, error=_error)
99
105
 
100
106
  #cleanup
101
107
  await _cleanup_directory(working_path)
@@ -116,7 +122,9 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
116
122
  del documents
117
123
  gc.collect()
118
124
  else:
119
- return VectorDbResponse(success = False, error = "No documents found in the knowledgebase folder")
125
+ _error = "No documents found in the knowledgebase folder"
126
+ logging.warning(_error)
127
+ return VectorDbResponse(success = False, error = _error)
120
128
 
121
129
  async def kb_stream_file(filename: str):
122
130
  file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
@@ -12,7 +12,7 @@ class DropboxParams(BaseModel):
12
12
  Attributes:
13
13
  remote_url (str): The URL of the remote Dropbox location, e.g. 'dropbox://demo-directory' or 'dropbox://demo-directory/sub-directory'.
14
14
  token (str): The authentication token for accessing Dropbox.
15
- create app: https://www.dropbox.com/developers, with file.content.read permission, and generate token.
15
+ create app: https://www.dropbox.com/developers, with file.content.read permission, and generate token, or use existing app: https://www.dropbox.com/account/connected_apps / https://www.dropbox.com/developers/apps?_tk=pilot_lp&_ad=topbar4&_camp=myapps
16
16
  recursive (bool, optional): A flag indicating whether to search directories recursively. Defaults to False.
17
17
  extension (list[str], optional): A list of file extensions to filter by. Defaults to None, e.g. ['.pdf', '.docx'].
18
18
  """
@@ -9,6 +9,7 @@ from ws_bom_robot_app.llm.vector_store.integration.googledrive import GoogleDriv
9
9
  from ws_bom_robot_app.llm.vector_store.integration.jira import Jira
10
10
  from ws_bom_robot_app.llm.vector_store.integration.s3 import S3
11
11
  from ws_bom_robot_app.llm.vector_store.integration.sftp import Sftp
12
+ from ws_bom_robot_app.llm.vector_store.integration.sharepoint import Sharepoint
12
13
  from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap
13
14
  from ws_bom_robot_app.llm.vector_store.integration.slack import Slack
14
15
 
@@ -23,6 +24,7 @@ class IntegrationManager:
23
24
  "llmkbjira": Jira,
24
25
  "llmkbs3": S3,
25
26
  "llmkbsftp": Sftp,
27
+ "llmkbsharepoint": Sharepoint,
26
28
  "llmkbsitemap": Sitemap,
27
29
  "llmkbslack": Slack,
28
30
 
@@ -0,0 +1,106 @@
1
+ import asyncio, logging, traceback
2
+ from dataclasses import dataclass
3
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
4
+ from unstructured_ingest.v2.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointIndexer, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
5
+ from langchain_core.documents import Document
6
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
7
+ from typing import Union, Optional
8
+ from pydantic import BaseModel, Field, AliasChoices
9
+
10
class SharepointParams(BaseModel):
    """
    SharepointParams is a Pydantic model that defines the parameters required to connect to a SharePoint site.

    Attributes:
        client_id (str): The client ID for SharePoint authentication.
        client_secret (str): The client secret for SharePoint authentication.
        site_url (str): The URL of the SharePoint site, e.g. site collection level: https://<tenant>.sharepoint.com/sites/<site-collection-name>, or root site: https://<tenant>.sharepoint.com
        site_path (str, optional): The path in the SharePoint site from which to start parsing files, for example "Shared Documents". Defaults to None.
        recursive (bool, optional): Whether to recursively access subdirectories. Defaults to False.
        omit_files (bool, optional): Whether to omit files from the results. Defaults to False.
        omit_pages (bool, optional): Whether to omit pages from the results. Defaults to False.
        omit_lists (bool, optional): Whether to omit lists from the results. Defaults to False.
        extension (list[str], optional): A list of file extensions to include, e.g. [".pdf"]. Defaults to None.
    """
    client_id: str = Field(validation_alias=AliasChoices("clientId", "client_id"))
    client_secret: str = Field(validation_alias=AliasChoices("clientSecret", "client_secret"))
    site_url: str = Field(validation_alias=AliasChoices("siteUrl", "site_url"))
    # Defaults to None, so the annotation must admit None as well.
    site_path: Optional[str] = Field(default=None, validation_alias=AliasChoices("sitePath", "site_path"))
    recursive: bool = Field(default=False)
    # No trailing commas after Field(...): a trailing comma turns the assigned
    # value into a one-element tuple, so the attribute would no longer be the
    # Field that declares default=False and the camelCase alias.
    omit_files: bool = Field(default=False, validation_alias=AliasChoices("omitFiles", "omit_files"))
    omit_pages: bool = Field(default=False, validation_alias=AliasChoices("omitPages", "omit_pages"))
    omit_lists: bool = Field(default=False, validation_alias=AliasChoices("omitLists", "omit_lists"))
    extension: Optional[list[str]] = Field(default=None)
34
class Sharepoint(IntegrationStrategy):
    """Integration strategy that downloads SharePoint content and loads it as documents."""

    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
        """Validate the raw ``data`` into ``SharepointParams`` and prepare the ingest helper."""
        super().__init__(knowledgebase_path, data)
        self.__data = SharepointParams.model_validate(self.data)
        self.__unstructured_ingest = UnstructuredIngest(self.working_directory)

    def working_subdirectory(self) -> str:
        """Return the name of the subdirectory used by this integration."""
        return 'sharepoint'

    def run(self) -> None:
        """Build the unstructured-ingest pipeline for the configured site and run it."""
        params = self.__data
        indexer_config = SharepointIndexerConfig(
            path=params.site_path,
            recursive=params.recursive,
            omit_files=params.omit_files,
            omit_pages=params.omit_pages,
            omit_lists=params.omit_lists,
        )
        downloader_config = SharepointDownloaderConfig(download_dir=self.working_directory)
        access_config = SharepointAccessConfig(client_cred=params.client_secret)
        connection_config = SharepointConnectionConfig(
            access_config=access_config,
            client_id=params.client_id,
            site=params.site_url,
            permissions_config=None,
        )
        pipeline = self.__unstructured_ingest.pipeline(
            indexer_config,
            downloader_config,
            connection_config,
            extension=params.extension,
        )
        # Swap the pipeline's indexer for a CustomSharepointIndexer built from
        # the attributes of the indexer the pipeline created.
        indexer_step = pipeline.indexer_step
        indexer_step.process = CustomSharepointIndexer(**vars(indexer_step.process))
        pipeline.run()

    async def load(self) -> list[Document]:
        """Run the pipeline in a worker thread, pause one second, then load the working directory."""
        await asyncio.to_thread(self.run)
        await asyncio.sleep(1)
        loader = Loader(self.working_directory)
        documents = await loader.load()
        return documents
70
+
71
@dataclass
class CustomSharepointIndexer(SharepointIndexer):
    """SharepointIndexer variant that logs listing/conversion errors instead of raising.

    Each overridden method delegates to the base implementation and, on any
    exception, logs the error with its traceback and returns an empty result.
    """

    def __init__(self, **kwargs):
        """Copy every keyword argument onto the instance as an attribute."""
        # Initialize all attributes from the base indexer
        for key, value in kwargs.items():
            setattr(self, key, value)

    def list_files(self, folder, recursive):
        """Return the base indexer's file listing, or ``[]`` if listing fails."""
        try:
            return super().list_files(folder, recursive)
        except Exception as e:
            tb = traceback.format_exc()
            logging.error(f"Error listing sharepoint files: {e} \n {tb}")
            return []

    def file_to_file_data(self, client, file):
        """Return the base indexer's file data, or ``None`` if conversion fails."""
        try:
            return super().file_to_file_data(client, file)
        except Exception as e:
            tb = traceback.format_exc()
            logging.error(f"Error converting sharepoint file {file} to data: {e} \n {tb}")
            return None

    def list_pages(self, client):
        """Return the base indexer's pages, keeping only those sharing the first page's content type.

        Returns ``[]`` if listing or filtering fails.
        """
        try:
            _pages = super().list_pages(client)
            _allowed_content_type = None
            # Iterate over a snapshot: remove_child() is called on the same
            # collection being walked, and removing while iterating can skip
            # the element that follows each removal.
            # NOTE(review): assumes remove_child mutates _pages in place - confirm.
            for page in list(_pages):
                # determine the allowed content type from the first page (Home.aspx)
                if not _allowed_content_type:
                    _allowed_content_type = page.content_type_id
                if not page.content_type_id == _allowed_content_type:
                    _pages.remove_child(page)
            return _pages
        except Exception as e:
            tb = traceback.format_exc()
            logging.error(f"Error listing sharepoint pages: {e} \n {tb}")
            return []
@@ -1,14 +1,14 @@
1
-
2
1
  import asyncio, gc, logging, os, traceback
3
2
  from typing import Any, Optional
4
3
  from langchain_community.document_loaders import DirectoryLoader
5
4
  from langchain_community.document_loaders.base import BaseLoader
6
5
  from langchain_community.document_loaders.merge import MergedDataLoader
7
6
  from langchain_core.documents import Document
8
- from langchain_unstructured import UnstructuredLoader
9
7
  from pydantic import BaseModel
10
8
  from ws_bom_robot_app.config import config
11
9
  from ws_bom_robot_app.llm.vector_store.loader.json_loader import JsonLoader
10
+ from ws_bom_robot_app.llm.vector_store.loader.docling import DoclingLoader
11
+ from langchain_community.document_loaders import CSVLoader, UnstructuredPowerPointLoader, UnstructuredEmailLoader, UnstructuredXMLLoader, TextLoader, UnstructuredHTMLLoader
12
12
 
13
13
  class LoaderConfig(BaseModel):
14
14
  loader: type[BaseLoader]
@@ -22,39 +22,46 @@ class Loader():
22
22
 
23
23
  _list: dict[str, LoaderConfig | None] = {
24
24
  '.json': LoaderConfig(loader=JsonLoader),
25
- '.csv': LoaderConfig(loader=UnstructuredLoader),
26
- '.xls': LoaderConfig(loader=UnstructuredLoader),
27
- '.xlsx': LoaderConfig(loader=UnstructuredLoader),
28
- '.eml': LoaderConfig(loader=UnstructuredLoader),
29
- '.msg': LoaderConfig(loader=UnstructuredLoader),
25
+ '.csv': LoaderConfig(loader=CSVLoader),
26
+ '.xls': None,
27
+ '.xlsx': LoaderConfig(loader=DoclingLoader),
28
+ '.eml': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
29
+ '.msg': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
30
30
  '.epub': None,
31
- '.md': LoaderConfig(loader=UnstructuredLoader),
31
+ '.md': LoaderConfig(loader=TextLoader),
32
32
  '.org': None,
33
33
  '.odt': None,
34
34
  '.ppt': None,
35
- '.pptx': LoaderConfig(loader=UnstructuredLoader),
36
- '.txt': LoaderConfig(loader=UnstructuredLoader),
35
+ '.pptx': LoaderConfig(loader=UnstructuredPowerPointLoader,kwargs={"strategy":"auto"}), #docling issue with WMF https://github.com/DS4SD/docling/issues/594
36
+ '.txt': LoaderConfig(loader=TextLoader),
37
37
  '.rst': None,
38
38
  '.rtf': None,
39
39
  '.tsv': None,
40
40
  '.text': None,
41
41
  '.log': None,
42
- '.htm': LoaderConfig(loader=UnstructuredLoader),
43
- '.html': LoaderConfig(loader=UnstructuredLoader),
44
- '.pdf': LoaderConfig(loader=UnstructuredLoader,kwargs={
45
- 'strategy':'ocr_only', #https://docs.unstructured.io/open-source/core-functionality/partitioning auto,ocr_only,hi_res
46
- 'split_pdf_page': False,
47
- 'chunking_strategy': 'basic',
48
- 'max_characters': 1000000,
49
- 'include_page_breaks': True,
50
- 'include_orig_elements': False}),
51
- '.png': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
52
- '.jpg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
53
- '.jpeg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
42
+ '.htm': LoaderConfig(loader=UnstructuredHTMLLoader,kwargs={"strategy":"auto"}),
43
+ '.html': LoaderConfig(loader=UnstructuredHTMLLoader,kwargs={"strategy":"auto"}),
44
+ ".pdf": LoaderConfig(loader=DoclingLoader),
45
+ #'.pdf': LoaderConfig(loader=UnstructuredLoader,kwargs={
46
+ # 'strategy':'ocr_only', #https://docs.unstructured.io/open-source/core-functionality/partitioning auto,ocr_only,hi_res
47
+ # 'split_pdf_page': False,
48
+ # 'chunking_strategy': 'basic',
49
+ # 'max_characters': 10_000,
50
+ # 'include_page_breaks': True,
51
+ # 'include_orig_elements': False}),
52
+ #'.png': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
53
+ #'.jpg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
54
+ #'.jpeg': LoaderConfig(loader=UnstructuredLoader,kwargs={"strategy":"ocr_only"}),
55
+ '.png': LoaderConfig(loader=DoclingLoader),
56
+ '.jpg': LoaderConfig(loader=DoclingLoader),
57
+ '.jpeg': LoaderConfig(loader=DoclingLoader),
58
+ '.gif': None,
59
+ ".emf": None,
60
+ ".wmf": None,
54
61
  '.tiff': None,
55
62
  '.doc': None, #see liberoffice dependency
56
- '.docx': LoaderConfig(loader=UnstructuredLoader),
57
- '.xml': LoaderConfig(loader=UnstructuredLoader),
63
+ '.docx': LoaderConfig(loader=DoclingLoader),
64
+ '.xml': LoaderConfig(loader=UnstructuredXMLLoader,kwargs={"strategy":"auto"}),
58
65
  '.js': None,
59
66
  '.py': None,
60
67
  '.c': None,
@@ -78,7 +85,7 @@ class Loader():
78
85
  loader_configs = {}
79
86
  for ext, loader_config in Loader._list.items():
80
87
  if loader_config:
81
- if all([self._runtime_options.loader_strategy != "",loader_config.kwargs,"strategy" in loader_config.kwargs]): # type: ignore
88
+ if all([self._runtime_options.loader_strategy != "",loader_config.kwargs and "strategy" in loader_config.kwargs]): # type: ignore
82
89
  loader_config.kwargs["strategy"] = self._runtime_options.loader_strategy # type: ignore
83
90
  loader_key = (loader_config.loader, tuple(loader_config.kwargs.items())) # type: ignore
84
91
  if loader_key not in loader_configs:
@@ -119,7 +126,7 @@ class Loader():
119
126
  return _documents
120
127
  except Exception as e:
121
128
  logging.warning(f"Attempt {attempt+1} load document failed: {e}")
122
- await asyncio.sleep(1)
129
+ await asyncio.sleep(2)
123
130
  if attempt == MAX_RETRIES - 1:
124
131
  tb = traceback.format_exc()
125
132
  logging.error(f"Failed to load documents: {e} | {tb}")
@@ -0,0 +1,35 @@
1
+ import os, logging, traceback
2
+ from typing import Iterator, AsyncIterator, Optional
3
+ from langchain_core.document_loaders import BaseLoader
4
+ from langchain_core.documents import Document
5
+ from langchain_core.runnables import run_in_executor
6
+ from docling.document_converter import DocumentConverter, ConversionResult, ConversionStatus
7
+
8
class DoclingLoader(BaseLoader):
    """Document loader that converts files with docling and exposes them as markdown."""

    def __init__(self, file_path: str | list[str]) -> None:
        """Store the path(s) to convert and create the docling converter."""
        if isinstance(file_path, list):
            self._file_paths = file_path
        else:
            self._file_paths = [file_path]
        self._converter = DocumentConverter()

    def load(self) -> list[Document]:
        """Load data into Document objects."""
        return [document for document in self.lazy_load()]

    async def aload(self) -> list[Document]:
        """Load data into Document objects."""
        collected = []
        async for document in self.alazy_load():
            collected.append(document)
        return collected

    async def alazy_load(self) -> AsyncIterator[Document]:
        """A lazy loader for Documents."""
        generator = await run_in_executor(None, self.lazy_load)
        exhausted = object()
        while True:
            # next() with a default returns the marker object instead of
            # raising StopIteration once the generator is exhausted.
            item = await run_in_executor(None, next, generator, exhausted)  # type: ignore[call-arg, arg-type]
            if item is exhausted:
                break
            yield item  # type: ignore[misc]

    def lazy_load(self) -> Iterator[Document]:
        """Convert each stored path and yield one Document per file.

        A file that fails is logged as a warning and skipped.
        """
        for source in self._file_paths:
            try:
                absolute_path = os.path.abspath(source)
                conversion = self._converter.convert(absolute_path, raises_on_error=True)
                markdown = conversion.document.export_to_markdown()
                yield Document(page_content=markdown, metadata={"source": source})
            except Exception as e:
                logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.23
3
+ Version: 0.0.25
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -21,21 +21,20 @@ Requires-Dist: langchain-openai==0.2.10
21
21
  Requires-Dist: langchain-community==0.3.8
22
22
  Requires-Dist: langchain-core==0.3.21
23
23
  Requires-Dist: faiss-cpu==1.9.0
24
- Requires-Dist: python-magic==0.4.27
25
- Requires-Dist: opencv-python-headless==4.10.0.84
26
- Requires-Dist: unstructured[all-docs]==0.16.11
27
- Requires-Dist: langchain_unstructured==0.1.5
24
+ Requires-Dist: unstructured==0.16.11
25
+ Requires-Dist: unstructured[image]
28
26
  Requires-Dist: unstructured-ingest==0.3.8
29
27
  Requires-Dist: unstructured-ingest[azure]
30
28
  Requires-Dist: unstructured-ingest[confluence]
31
29
  Requires-Dist: unstructured-ingest[dropbox]
32
30
  Requires-Dist: unstructured-ingest[gcs]
33
- Requires-Dist: unstructured-ingest[google_drive]
34
31
  Requires-Dist: unstructured-ingest[github]
32
+ Requires-Dist: unstructured-ingest[google_drive]
35
33
  Requires-Dist: unstructured-ingest[jira]
36
34
  Requires-Dist: unstructured-ingest[s3]
37
- Requires-Dist: unstructured-ingest[slack]
38
35
  Requires-Dist: unstructured-ingest[sftp]
36
+ Requires-Dist: unstructured-ingest[sharepoint]
37
+ Requires-Dist: unstructured-ingest[slack]
39
38
  Requires-Dist: html5lib==1.1
40
39
  Requires-Dist: markdownify==0.14.1
41
40
  Requires-Dist: nebuly==0.3.33
@@ -214,11 +213,22 @@ launch debugger
214
213
  streamlit run debugger.py --server.port 6002
215
214
  ```
216
215
 
216
+ dockerize base image
217
+
218
+ ```pwsh
219
+ <# cpu #>
220
+ docker build -f Dockerfile-robot-base-cpu -t ghcr.io/websolutespa/ws-bom-robot-base:cpu .
221
+ docker push ghcr.io/websolutespa/ws-bom-robot-base:cpu
222
+ <# gpu #>
223
+ docker build -f Dockerfile-robot-base-gpu -t ghcr.io/websolutespa/ws-bom-robot-base:gpu .
224
+ docker push ghcr.io/websolutespa/ws-bom-robot-base:gpu
225
+ ```
226
+
217
227
  dockerize app from src
218
228
 
219
229
  ```pwsh
220
230
  docker build -f Dockerfile-src -t ws-bom-robot-app:src .
221
- docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
231
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -v "$(pwd)/.data:/app/.data" -p 6001:6001 ws-bom-robot-app:src
222
232
  ```
223
233
 
224
234
  ### ✈️ publish
@@ -7,20 +7,20 @@ ws_bom_robot_app/task_manager.py,sha256=Zedzs2R3O-wNSQOqs4jorgFwPRi-ji_0TN4mGfk-
7
7
  ws_bom_robot_app/util.py,sha256=b49ItlZgh2Wzw-6K8k5Wa44eVgjQ0JmWQwJnEaQBVGw,3502
8
8
  ws_bom_robot_app/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  ws_bom_robot_app/llm/agent_description.py,sha256=SDJYMmwfdMxEK3a_HDEQ19bfNKmwMSFf5hqU0VSCCIE,4705
10
- ws_bom_robot_app/llm/agent_handler.py,sha256=hbOf9i-ynDL3bcClqtUG-yWY8ohbCxONfT5ek9Cv0gY,5667
10
+ ws_bom_robot_app/llm/agent_handler.py,sha256=Qz3h1eZdA6pkurEbr8sQwl-0FdjugaO5Q9sB8f7iD9I,5808
11
11
  ws_bom_robot_app/llm/agent_lcel.py,sha256=jkSLMy6y_ZFvWT8bhBBYHY5CO-ea8oMSPMXMahFUBFc,2666
12
- ws_bom_robot_app/llm/api.py,sha256=5cO49yhU5EXvl20zJORmrZZPc1G_nlvftdcF7cyn4Qc,3252
12
+ ws_bom_robot_app/llm/api.py,sha256=vBu_TFTlBjp7e3J-WmlZbXn_TbB550x-NpQN4YsO7To,3004
13
13
  ws_bom_robot_app/llm/defaut_prompt.py,sha256=pn5a4lNLWE1NngHYjA_7tD8GasePMgsgude5fIJxsW0,756
14
- ws_bom_robot_app/llm/main.py,sha256=oD8dPvoEYD2MK8dU8tjdqBmPxxBcILSSPkQzNlMGayk,3712
14
+ ws_bom_robot_app/llm/main.py,sha256=_uW3Iy9iPJbxDfpyoReu3mbYY8a9dS1V6tZU-z6BELo,3547
15
15
  ws_bom_robot_app/llm/settings.py,sha256=EkFGCppORenStH9W4e6_dYvQ-5p6xiEMpmUHBqNqG9M,117
16
16
  ws_bom_robot_app/llm/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  ws_bom_robot_app/llm/models/api.py,sha256=KlVUbApyz6uuWefAN9K4B_vWDSps5hLW6hNg1Eo3TBQ,6996
18
18
  ws_bom_robot_app/llm/models/base.py,sha256=1TqxuTK3rjJEALn7lvgoen_1ba3R2brAgGx6EDTtDZo,152
19
19
  ws_bom_robot_app/llm/models/kb.py,sha256=9zqwDlVULVrWE48wo5AivzWoOtnjA57k9rsw8KNnyDk,8935
20
20
  ws_bom_robot_app/llm/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- ws_bom_robot_app/llm/tools/tool_builder.py,sha256=rkYu0PrXV84PMi7INjCSWlrWMykUCI8aeF-QjZgLysM,854
22
- ws_bom_robot_app/llm/tools/tool_manager.py,sha256=ZRUzSIrHKrJANc7hrp7st8qRZiy_S-_aN8fIgk7g0U8,4353
23
- ws_bom_robot_app/llm/tools/utils.py,sha256=1uOJGcFKiIDOipLCokEMdlxJLRQpgpOavm1g5-FLjMU,1307
21
+ ws_bom_robot_app/llm/tools/tool_builder.py,sha256=z9SdwD6dJaJbLByHIGUaIbqbNm33an9agNnm5njSb6Q,901
22
+ ws_bom_robot_app/llm/tools/tool_manager.py,sha256=DzJLQCLBb2jesOx2rR56_z3KyWqwJpvUGD16ImxOj34,4495
23
+ ws_bom_robot_app/llm/tools/utils.py,sha256=yT8dJ2pywCJb-6VlgltVPEa4-b3XT8UYWUqW9m1cKWo,1307
24
24
  ws_bom_robot_app/llm/tools/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  ws_bom_robot_app/llm/tools/models/main.py,sha256=LsOJ7vkcSzYLoE1oa3TG0Rs0pr9J5VS_e4li6aDx_fw,260
26
26
  ws_bom_robot_app/llm/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -28,28 +28,30 @@ ws_bom_robot_app/llm/utils/agent_utils.py,sha256=LEfAKQwFrwmIdJL0o54iuGrir9uLcJh
28
28
  ws_bom_robot_app/llm/utils/download.py,sha256=iAUxH_NiCpTPtGzhC4hBtxotd2HPFt2MBhttslIxqiI,3194
29
29
  ws_bom_robot_app/llm/utils/faiss_helper.py,sha256=69juxptz1gidgxVOrqNvJajRl40p5-ugHqyEBDtnSKo,5036
30
30
  ws_bom_robot_app/llm/utils/kb.py,sha256=jja45WCbNI7SGEgqDS99nErlwB5eY8Ga7BMnhdMHZ90,1279
31
- ws_bom_robot_app/llm/utils/print.py,sha256=bpLWY0KHXe7x7PWcWy8NS54ZWzHY8b4jrLRkpnDl108,818
31
+ ws_bom_robot_app/llm/utils/print.py,sha256=ZonoLPcfM6Cpw4_Ec455LiCovExOwvnIgvw1QORSCBY,799
32
32
  ws_bom_robot_app/llm/utils/webhooks.py,sha256=LAAZqyN6VhV13wu4X-X85TwdDgAV2rNvIwQFIIc0FJM,2114
33
33
  ws_bom_robot_app/llm/vector_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
- ws_bom_robot_app/llm/vector_store/generator.py,sha256=aVUG08rcsDvtVxfYfUalO1CbKGkLazNyP555IsQQZso,5975
34
+ ws_bom_robot_app/llm/vector_store/generator.py,sha256=Xg-srcH_03lqPHkMn1EXP56GbY1CYa2zIbjvNfcQqyM,6192
35
35
  ws_bom_robot_app/llm/vector_store/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  ws_bom_robot_app/llm/vector_store/integration/azure.py,sha256=R37TaPQP-HJJJiaKE9rmMc9kpeXeRvdebbTY_982om0,3392
37
37
  ws_bom_robot_app/llm/vector_store/integration/base.py,sha256=IvIu8RkISuurrVKr2YPG96fsOI2kqhaEGyTGzjB-jCI,1550
38
38
  ws_bom_robot_app/llm/vector_store/integration/confluence.py,sha256=4fiRHB3J-SHZZxNGHwVkCrT-xSPbc91z4WrDE9fy6xU,2505
39
- ws_bom_robot_app/llm/vector_store/integration/dropbox.py,sha256=Am0jfh54OFUTQihqmkwgKIkcEmjehDh_0NQzBuxTdGY,2481
39
+ ws_bom_robot_app/llm/vector_store/integration/dropbox.py,sha256=yhGvHTN0TEpUfhdvvV7RX5MxBwTUyddAX95Fgqp3mCg,2629
40
40
  ws_bom_robot_app/llm/vector_store/integration/gcs.py,sha256=fFDVDUR6eNB7FVTzDSEpMHFEWMgG16GLnpSf_mqGDdE,3184
41
41
  ws_bom_robot_app/llm/vector_store/integration/github.py,sha256=18PO30AZcgTn6PHhid3MwImVAdmKBNkr0kmAPgOetGw,2663
42
42
  ws_bom_robot_app/llm/vector_store/integration/googledrive.py,sha256=R6hr8iEgrR3QMOzIj5jY6w1x8pZ1LGdh4xM_q7g_ttc,3738
43
43
  ws_bom_robot_app/llm/vector_store/integration/jira.py,sha256=o5iINIblp_yNszp54nf7fW97aqjs0A5G89N8sYrd1ds,2771
44
- ws_bom_robot_app/llm/vector_store/integration/manager.py,sha256=_aNiy3w_fnWOwaGkp_X_hDNstnPbxtCuojikBzRSArE,1583
44
+ ws_bom_robot_app/llm/vector_store/integration/manager.py,sha256=5Fl3XML6f1wmgraigpUwIFIXh7QFPX0RI0YFgFxBAvg,1700
45
45
  ws_bom_robot_app/llm/vector_store/integration/s3.py,sha256=3kh-VmH84IW7DdSLvOk6td1VBJ9aohlVJsk5F3cYj0U,3320
46
46
  ws_bom_robot_app/llm/vector_store/integration/sftp.py,sha256=WNzjjS1EUykgFB-8e7QkecSa1r1jTJqKyGzR25uJCtM,2848
47
+ ws_bom_robot_app/llm/vector_store/integration/sharepoint.py,sha256=zqqn-6qPrK50Phch4nZHJTgaPyPkGe7W2InGL_Ru6vE,5376
47
48
  ws_bom_robot_app/llm/vector_store/integration/sitemap.py,sha256=nPbIywp-ZwWbWStvjvYVgHqqejyYFr8eZhBc8ycTuaU,4206
48
49
  ws_bom_robot_app/llm/vector_store/integration/slack.py,sha256=FMjESXm2QetFXI6i8epze7Kbbu22fV8CVaxb71AHnJ8,2572
49
50
  ws_bom_robot_app/llm/vector_store/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
- ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=ZvcyUPOoQ44gyfl80Jf0y9vbcj8uTUb-lYkb_m2pt1I,5137
51
+ ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=SWV7T6BcsV8nvnUAHHZ9Q2oFUEnfwM33jpJCry5vbIA,5847
52
+ ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=aHHfMf2JsZo0o6jrRDlImY0Oi9NFhVQk8Wg5ePAPa50,1721
51
53
  ws_bom_robot_app/llm/vector_store/loader/json_loader.py,sha256=qo9ejRZyKv_k6jnGgXnu1W5uqsMMtgqK_uvPpZQ0p74,833
52
- ws_bom_robot_app-0.0.23.dist-info/METADATA,sha256=tWzTgX6aPIvqV-chg-GyxPQCogt14pzEPKM3DJLR0yw,7152
53
- ws_bom_robot_app-0.0.23.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
54
- ws_bom_robot_app-0.0.23.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
55
- ws_bom_robot_app-0.0.23.dist-info/RECORD,,
54
+ ws_bom_robot_app-0.0.25.dist-info/METADATA,sha256=TObdL0LhroQrJaqOUTwLEY9gqyk_ct-yDPJzcOWps_w,7478
55
+ ws_bom_robot_app-0.0.25.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
56
+ ws_bom_robot_app-0.0.25.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
57
+ ws_bom_robot_app-0.0.25.dist-info/RECORD,,