ws-bom-robot-app 0.0.37__py3-none-any.whl → 0.0.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. ws_bom_robot_app/config.py +35 -7
  2. ws_bom_robot_app/cron_manager.py +15 -14
  3. ws_bom_robot_app/llm/agent_context.py +26 -0
  4. ws_bom_robot_app/llm/agent_description.py +123 -123
  5. ws_bom_robot_app/llm/agent_handler.py +176 -180
  6. ws_bom_robot_app/llm/agent_lcel.py +107 -54
  7. ws_bom_robot_app/llm/api.py +100 -7
  8. ws_bom_robot_app/llm/defaut_prompt.py +15 -15
  9. ws_bom_robot_app/llm/evaluator.py +319 -0
  10. ws_bom_robot_app/llm/feedbacks/__init__.py +0 -0
  11. ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -0
  12. ws_bom_robot_app/llm/main.py +159 -110
  13. ws_bom_robot_app/llm/models/api.py +70 -5
  14. ws_bom_robot_app/llm/models/feedback.py +30 -0
  15. ws_bom_robot_app/llm/nebuly_handler.py +185 -0
  16. ws_bom_robot_app/llm/providers/llm_manager.py +244 -80
  17. ws_bom_robot_app/llm/tools/models/main.py +8 -0
  18. ws_bom_robot_app/llm/tools/tool_builder.py +68 -23
  19. ws_bom_robot_app/llm/tools/tool_manager.py +343 -133
  20. ws_bom_robot_app/llm/tools/utils.py +41 -25
  21. ws_bom_robot_app/llm/utils/agent.py +34 -0
  22. ws_bom_robot_app/llm/utils/chunker.py +6 -1
  23. ws_bom_robot_app/llm/utils/cleanup.py +81 -0
  24. ws_bom_robot_app/llm/utils/cms.py +123 -0
  25. ws_bom_robot_app/llm/utils/download.py +183 -79
  26. ws_bom_robot_app/llm/utils/print.py +29 -29
  27. ws_bom_robot_app/llm/vector_store/db/__init__.py +0 -0
  28. ws_bom_robot_app/llm/vector_store/db/base.py +193 -0
  29. ws_bom_robot_app/llm/vector_store/db/chroma.py +97 -0
  30. ws_bom_robot_app/llm/vector_store/db/faiss.py +91 -0
  31. ws_bom_robot_app/llm/vector_store/db/manager.py +15 -0
  32. ws_bom_robot_app/llm/vector_store/db/qdrant.py +73 -0
  33. ws_bom_robot_app/llm/vector_store/generator.py +137 -137
  34. ws_bom_robot_app/llm/vector_store/integration/api.py +216 -0
  35. ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
  36. ws_bom_robot_app/llm/vector_store/integration/base.py +58 -15
  37. ws_bom_robot_app/llm/vector_store/integration/confluence.py +41 -11
  38. ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  39. ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
  40. ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
  41. ws_bom_robot_app/llm/vector_store/integration/googledrive.py +46 -17
  42. ws_bom_robot_app/llm/vector_store/integration/jira.py +112 -75
  43. ws_bom_robot_app/llm/vector_store/integration/manager.py +6 -2
  44. ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
  45. ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
  46. ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
  47. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
  48. ws_bom_robot_app/llm/vector_store/integration/sitemap.py +9 -1
  49. ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
  50. ws_bom_robot_app/llm/vector_store/integration/thron.py +236 -0
  51. ws_bom_robot_app/llm/vector_store/loader/base.py +52 -8
  52. ws_bom_robot_app/llm/vector_store/loader/docling.py +71 -33
  53. ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
  54. ws_bom_robot_app/main.py +148 -146
  55. ws_bom_robot_app/subprocess_runner.py +106 -0
  56. ws_bom_robot_app/task_manager.py +207 -54
  57. ws_bom_robot_app/util.py +65 -20
  58. ws_bom_robot_app-0.0.103.dist-info/METADATA +364 -0
  59. ws_bom_robot_app-0.0.103.dist-info/RECORD +76 -0
  60. {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/WHEEL +1 -1
  61. ws_bom_robot_app/llm/settings.py +0 -4
  62. ws_bom_robot_app/llm/utils/agent_utils.py +0 -17
  63. ws_bom_robot_app/llm/utils/kb.py +0 -34
  64. ws_bom_robot_app-0.0.37.dist-info/METADATA +0 -277
  65. ws_bom_robot_app-0.0.37.dist-info/RECORD +0 -60
  66. {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/integration/shopify.py
@@ -0,0 +1,143 @@
1
+ import asyncio, logging, aiohttp
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
+ from langchain_core.documents import Document
4
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
5
+ from typing import List, Union, Optional
6
+ from pydantic import BaseModel, Field, AliasChoices, field_validator
7
+ import json
8
+ import os
9
+
10
+ class ShopifyParams(BaseModel):
11
+ """
12
+ ShopifyParams is a model that defines the parameters required for Shopify integration.
13
+
14
+ Attributes:
15
+ shop_name (str): The shop name for Shopify.
16
+ access_token (str): The access token for Shopify.
17
+ graphql_query (Union[str, dict]): The GraphQL query string or dict for Shopify.
18
+ """
19
+ shop_name: str = Field(validation_alias=AliasChoices("shopName","shop_name"))
20
+ access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
21
+ graphql_query: Union[str, dict] = Field(validation_alias=AliasChoices("graphqlQuery","graphql_query"))
22
+
23
+ @field_validator('graphql_query')
24
+ @classmethod
25
+ def extract_query_string(cls, v):
26
+ """Extract the query string from dict format if needed"""
27
+ if isinstance(v, dict) and 'query' in v:
28
+ return v['query']
29
+ return v
30
+
31
+ class Shopify(IntegrationStrategy):
32
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
33
+ super().__init__(knowledgebase_path, data)
34
+ self.__data = ShopifyParams.model_validate(self.data)
35
+
36
+ def working_subdirectory(self) -> str:
37
+ return 'shopify'
38
+
39
+ async def run(self) -> None:
40
+ _data = await self.__get_data()
41
+ json_file_path = os.path.join(self.working_directory, 'shopify_data.json')
42
+ with open(json_file_path, 'w', encoding='utf-8') as f:
43
+ json.dump(_data, f, ensure_ascii=False)
44
+
45
+ async def load(self) -> list[Document]:
46
+ await self.run()
47
+ await asyncio.sleep(1)
48
+ return await Loader(self.working_directory).load()
49
+
50
+ async def __get_data(self, page_size: int = 50) -> List[dict]:
51
+ # API endpoint URL
52
+ url = f"https://{self.__data.shop_name}.myshopify.com/admin/api/2024-07/graphql.json"
53
+
54
+ # Headers
55
+ headers = {
56
+ "X-Shopify-Access-Token": self.__data.access_token,
57
+ "Content-Type": "application/json"
58
+ }
59
+
60
+ all_data: List[dict] = []
61
+ has_next_page = True
62
+ cursor = None
63
+ retry_count = 0
64
+ max_retries = 5
65
+
66
+ while has_next_page:
67
+ # Query variables
68
+ variables = {
69
+ "first": page_size
70
+ }
71
+
72
+ if cursor:
73
+ variables["after"] = cursor
74
+
75
+ # Request payload
76
+ payload = {
77
+ "query": self.__data.graphql_query,
78
+ "variables": variables
79
+ }
80
+
81
+ try:
82
+ # Perform the request
83
+ async with aiohttp.ClientSession() as session:
84
+ async with session.post(url, headers=headers, json=payload) as response:
85
+ # Check whether the response is JSON
86
+ try:
87
+ data = await response.json()
88
+ except aiohttp.ContentTypeError:
89
+ text = await response.text()
90
+ logging.error(f"Non-JSON response received. Status code: {response.status}")
91
+ logging.error(f"Content: {text}")
92
+ raise Exception("Invalid response from API")
93
+
94
+ # Throttling handling
95
+ if "errors" in data:
96
+ error = data["errors"][0]
97
+ if error.get("extensions", {}).get("code") == "THROTTLED":
98
+ retry_count += 1
99
+ if retry_count > max_retries:
100
+ raise Exception("Too many throttling attempts. Stopping execution.")
101
+
102
+ # Wait a bit longer on each retry attempt
103
+ wait_time = 2 ** retry_count # Exponential backoff
104
+ print(f"Rate limit reached. Waiting {wait_time} seconds... (Attempt {retry_count}/{max_retries})")
105
+ await asyncio.sleep(wait_time)
106
+ continue
107
+ else:
108
+ raise Exception(f"GraphQL errors: {data['errors']}")
109
+
110
+ # Reset the retry counter after a successful request
111
+ retry_count = 0
112
+
113
+ # Extract the data
114
+ _data = list(data["data"].values())[0]
115
+ edges = _data["edges"]
116
+ page_info = _data["pageInfo"]
117
+
118
+ # Append the nodes to the result list
119
+ for edge in edges:
120
+ all_data.append(edge["node"])
121
+
122
+ # Update the cursor and the pagination flag
123
+ has_next_page = page_info["hasNextPage"]
124
+ cursor = page_info["endCursor"]
125
+
126
+ print(f"Recuperati {len(edges)} prodotti. Totale: {len(all_data)}")
127
+
128
+ # Short pause to avoid saturating the API
129
+ await asyncio.sleep(0.1)
130
+
131
+ except aiohttp.ClientError as e:
132
+ logging.error(f"Connection error: {e}")
133
+ retry_count += 1
134
+ if retry_count <= max_retries:
135
+ wait_time = 2 ** retry_count
136
+ logging.warning(f"Retrying in {wait_time} seconds...")
137
+ await asyncio.sleep(wait_time)
138
+ continue
139
+ else:
140
+ raise Exception("Too many network errors. Stopping execution.")
141
+
142
+ logging.info(f"Data retrieval completed! Total data: {len(all_data)}")
143
+ return all_data
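For orientation, a minimal usage sketch of the new Shopify strategy (not part of the diff): the shop name, access token, and GraphQL query below are placeholders, and the query must expose a single paginated connection with edges/node and pageInfo, which is the shape the pagination loop above expects.

import asyncio
from ws_bom_robot_app.llm.vector_store.integration.shopify import Shopify

data = {
    "shopName": "my-shop",         # placeholder shop subdomain
    "accessToken": "shpat_xxx",    # placeholder Admin API access token
    "graphqlQuery": """
      query($first: Int!, $after: String) {
        products(first: $first, after: $after) {
          edges { node { id title description } }
          pageInfo { hasNextPage endCursor }
        }
      }""",
}

# run() writes the collected nodes to shopify_data.json in the working
# directory; load() then parses that directory into langchain Documents
docs = asyncio.run(Shopify("./kb", data).load())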
ws_bom_robot_app/llm/vector_store/integration/sitemap.py
@@ -1,3 +1,4 @@
1
+ import sys, asyncio
1
2
  from typing import Any, AsyncGenerator, AsyncIterator
2
3
  import aiofiles
3
4
  import aiofiles.os
@@ -20,6 +21,7 @@ class Sitemap(IntegrationStrategy):
20
21
  data["excludeTag"] (list[str]): default to ["script", "noscript", "style", "head", "header","nav","footer", "iframe"]
21
22
  data["excludeClass"] (list[str]): ["class1", "class2"]
22
23
  data["excludeId"] (list[str]): ["id1", "id2"]
24
+ data["restrictDomain"] (bool): if True, only urls from the same domain will be loaded, default to True
23
25
  """
24
26
  def __init__(self, knowledgebase_path: str, data: dict[str, Any]):
25
27
  super().__init__(knowledgebase_path, data)
@@ -30,6 +32,8 @@ class Sitemap(IntegrationStrategy):
30
32
  self.__exclude_tag: list[str] = self.data.get("excludeTag",[]) # type: ignore
31
33
  self.__exclude_class: list[str] = self.data.get("excludeClass",[]) # type: ignore
32
34
  self.__exclude_id: list[str] = self.data.get("excludeId",[]) # type: ignore
35
+ self.__restrict_to_same_domain: bool = self.data.get("restrictDomain", True) # type: ignore
36
+ self.__header_template = self.data.get("headers", None)
33
37
  def working_subdirectory(self) -> str:
34
38
  return ""
35
39
  def _extract(self, tag: Tag) -> str:
@@ -62,6 +66,8 @@ class Sitemap(IntegrationStrategy):
62
66
  return f"{self.knowledgebase_path}/{url}" if self._is_local(url) else url
63
67
  async def alazy_load(self,loader: SitemapLoader) -> AsyncIterator[Document]:
64
68
  """A lazy loader for Documents."""
69
+ if sys.platform == 'win32':
70
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
65
71
  iterator = await run_in_executor(None, loader.lazy_load)
66
72
  done = object()
67
73
  while True:
@@ -75,7 +81,9 @@ class Sitemap(IntegrationStrategy):
75
81
  web_path=self._remap_if_local(self.__sitemap_url),
76
82
  filter_urls=self.__filter_urls,
77
83
  parsing_function=self._parse,
78
- is_local=self._is_local(self.__sitemap_url)
84
+ is_local=self._is_local(self.__sitemap_url),
85
+ restrict_to_same_domain=self.__restrict_to_same_domain,
86
+ header_template=self.__header_template
79
87
  )
80
88
  _docs = self._output([document async for document in self.alazy_load(_loader)])
81
89
  if self._is_local(self.__sitemap_url):
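For reference, a sketch of the data dict the Sitemap strategy now accepts; the sitemap key name and header values are illustrative, not taken from the diff.

sitemap_data = {
    "sitemapUrl": "https://example.com/sitemap.xml",  # assumption: the key read elsewhere by the strategy
    "excludeTag": ["script", "style"],
    "excludeClass": ["cookie-banner"],
    "excludeId": ["footer"],
    "restrictDomain": True,                           # new: only load URLs from the sitemap's own domain
    "headers": {"User-Agent": "ws-bom-robot"},        # new: forwarded to SitemapLoader as header_template
}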
ws_bom_robot_app/llm/vector_store/integration/slack.py
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
- from unstructured_ingest.v2.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
3
+ from unstructured_ingest.interfaces.downloader import DownloaderConfig
4
+ from unstructured_ingest.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
4
5
  from langchain_core.documents import Document
5
6
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
7
  from typing import Union
@@ -39,7 +40,7 @@ class Slack(IntegrationStrategy):
39
40
  start_date=datetime.now() - timedelta(days=self.__data.num_days),
40
41
  end_date=datetime.now()
41
42
  )
42
- downloader_config = SlackDownloaderConfig(
43
+ downloader_config = DownloaderConfig(
43
44
  download_dir=self.working_directory
44
45
  )
45
46
  connection_config = SlackConnectionConfig(
ws_bom_robot_app/llm/vector_store/integration/thron.py
@@ -0,0 +1,236 @@
1
+ import asyncio, logging, aiohttp
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
+ from langchain_core.documents import Document
4
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
5
+ from typing import List, Union, Optional
6
+ from pydantic import BaseModel, Field, AliasChoices
7
+ import json
8
+ import os
9
+ import platform
10
+ import pandas as pd
11
+ from io import BytesIO
12
+
13
+ # Fix for Windows event loop issue with aiodns
14
+ if platform.system() == 'Windows':
15
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
16
+
17
+ class ThronParams(BaseModel):
18
+ """
19
+ ThronParams is a model that defines the parameters required for Thron integration.
20
+
21
+ Attributes:
22
+ organization_name (str): The organization name (Thron subdomain).
+ attribute_fields (Optional[List[str]]): Optional product attribute fields to include in the export.
23
+ client_id (str): The client ID for Thron.
24
+ client_secret (str): The client secret for Thron.
25
+ """
26
+ organization_name: str = Field(validation_alias=AliasChoices("organizationName","organization_name"))
27
+ attribute_fields: Optional[List[str]] = Field(default=None, validation_alias=AliasChoices("attributeFields","attribute_fields"))
28
+ client_id: str = Field(validation_alias=AliasChoices("clientId","client_id"))
29
+ client_secret: str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
30
+
31
+ class Thron(IntegrationStrategy):
32
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
33
+ super().__init__(knowledgebase_path, data)
34
+ self.__token = None
35
+ self.__data = ThronParams.model_validate(self.data)
36
+
37
+ def working_subdirectory(self) -> str:
38
+ return 'thron'
39
+
40
+ async def __ensure_token(self) -> bool:
41
+ """Ensure we have a valid token, getting one if needed."""
42
+ if not self.__token:
43
+ self.__token = await self.__get_auth_token()
44
+ return bool(self.__token)
45
+
46
+ def __convert_xlsx_to_csv(self, file_content: bytes) -> bool:
47
+ """Convert XLSX file content to CSV and save to working directory."""
48
+ try:
49
+ df = pd.read_excel(BytesIO(file_content))
50
+ csv_path = os.path.join(self.working_directory, 'thron_export.csv')
51
+ df.to_csv(csv_path, index=False, encoding='utf-8')
52
+ return True
53
+ except Exception as e:
54
+ logging.error(f"Error converting XLSX to CSV: {e}")
55
+ return False
56
+
57
+ async def run(self) -> None:
58
+ _run_id = await self.__get_data()
59
+ if _run_id:
60
+ await self.__fetch_exported_file(_run_id)
61
+
62
+ async def load(self) -> list[Document]:
63
+ await self.run()
64
+ await asyncio.sleep(1)
65
+ return await Loader(self.working_directory).load()
66
+
67
+ async def __get_auth_token(self) -> Optional[str]:
68
+ """
69
+ Get authentication token from Thron API.
70
+
71
+ Returns:
72
+ str: The access token if successful, None otherwise.
73
+ """
74
+ try:
75
+ async with aiohttp.ClientSession() as session:
76
+ auth_data = {
77
+ "grant_type": "client_credentials",
78
+ "client_id": self.__data.client_id,
79
+ "client_secret": self.__data.client_secret
80
+ }
81
+ headers = {
82
+ "accept": "application/json",
83
+ "Content-Type": "application/x-www-form-urlencoded"
84
+ }
85
+ async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/authentication/oauth2/token", data=auth_data, headers=headers) as response:
86
+ result = await response.json()
87
+ return result.get("access_token", "")
88
+ except Exception as e:
89
+ logging.error(f"Error fetching Thron auth token: {e}")
90
+ return None
91
+
92
+ async def __refresh_token(self) -> bool:
93
+ """Refresh the authentication token and update the instance variable."""
94
+ try:
95
+ new_token = await self.__get_auth_token()
96
+ if new_token:
97
+ self.__token = new_token
98
+ logging.info("Thron authentication token refreshed successfully.")
99
+ return True
100
+ else:
101
+ logging.error("Failed to refresh Thron authentication token.")
102
+ return False
103
+ except Exception as e:
104
+ logging.error(f"Error refreshing Thron auth token: {e}")
105
+ return False
106
+
107
+ async def __get_data(self) -> Optional[str]:
108
+ """
109
+ Initiates a data export request to Thron API.
110
+
111
+ Returns:
112
+ str: The export ID if successful, None otherwise.
113
+ """
114
+ max_retries = 2
115
+ retry_count = 0
116
+
117
+ while retry_count < max_retries:
118
+ try:
119
+ if not await self.__ensure_token():
120
+ logging.error("Failed to obtain Thron authentication token.")
121
+ return None
122
+
123
+ async with aiohttp.ClientSession() as session:
124
+ headers = {
125
+ "accept": "application/json",
126
+ "Authorization": f"Bearer {self.__token}"
127
+ }
128
+ payload = {"attributes": self.__data.attribute_fields or [],"assetsBy":"CODE","type":"CODES","format":"XLSX","locales":[],"systemAttributes":["family","master","variation","variationGroup","hierarchyLevel"]}
129
+ async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/product-sync/exports", headers=headers, json=payload) as response:
130
+ # Check for authentication errors
131
+ if response.status == 401:
132
+ logging.warning("Authentication failed in __get_data, attempting to refresh token...")
133
+ if await self.__refresh_token():
134
+ retry_count += 1
135
+ continue
136
+ else:
137
+ logging.error("Token refresh failed in __get_data.")
138
+ return None
139
+
140
+ if response.status not in range(200, 300):
141
+ logging.error(f"API request failed with status {response.status}")
142
+ return None
143
+
144
+ result = await response.json()
145
+ return result.get("id", None)
146
+
147
+ except Exception as e:
148
+ logging.error(f"Error fetching Thron product data (attempt {retry_count + 1}): {e}")
149
+ if retry_count < max_retries - 1:
150
+ if await self.__refresh_token():
151
+ retry_count += 1
152
+ continue
153
+ retry_count += 1
154
+
155
+ logging.error(f"Failed to fetch Thron product data after {max_retries} attempts.")
156
+ return None
157
+
158
+
159
+ async def __fetch_exported_file(self, export_id: str) -> bool:
160
+ """
161
+ Fetches the exported file from Thron API using the provided export ID.
162
+ Polls the export status until it's processed, then downloads the XLSX file
163
+ and converts it to CSV format in the working directory.
164
+
165
+ Args:
166
+ export_id (str): The ID of the export to fetch.
167
+
168
+ Returns:
169
+ bool: True if file was successfully downloaded and converted, False otherwise.
170
+ """
171
+ max_retries = 2
172
+ retry_count = 0
173
+
174
+ while retry_count < max_retries:
175
+ try:
176
+ # Ensure we have a token
177
+ if not await self.__ensure_token():
178
+ logging.error("Failed to obtain Thron authentication token.")
179
+ return False
180
+
181
+ async with aiohttp.ClientSession() as session:
182
+ headers = {
183
+ "accept": "application/json",
184
+ "Authorization": f"Bearer {self.__token}"
185
+ }
186
+
187
+ # Polling until status is PROCESSED
188
+ while True:
189
+ async with session.get(f"https://{self.__data.organization_name}.thron.com/api/v1/product-sync/exports/{export_id}", headers=headers) as response:
190
+ # Check for authentication errors
191
+ if response.status == 401:
192
+ logging.warning("Authentication failed, attempting to refresh token...")
193
+ if await self.__refresh_token():
194
+ headers["Authorization"] = f"Bearer {self.__token}"
195
+ continue
196
+ else:
197
+ logging.error("Token refresh failed, aborting request.")
198
+ return False
199
+
200
+ if response.status != 200:
201
+ logging.error(f"API request failed with status {response.status}")
202
+ break
203
+
204
+ result = await response.json()
205
+ if result.get("status") == "PROCESSED":
206
+ download_uri = result.get("downloadUri")
207
+ if download_uri:
208
+ async with session.get(download_uri) as file_response:
209
+ if file_response.status == 200:
210
+ # Download XLSX file
211
+ file_content = await file_response.read()
212
+ return self.__convert_xlsx_to_csv(file_content)
213
+
214
+ elif file_response.status == 401:
215
+ logging.warning("Authentication failed during file download, attempting to refresh token...")
216
+ if await self.__refresh_token():
217
+ retry_count += 1
218
+ break
219
+ else:
220
+ logging.error("Token refresh failed during file download.")
221
+ return False
222
+ break
223
+
224
+ await asyncio.sleep(5)
225
+ return False
226
+
227
+ except Exception as e:
228
+ logging.error(f"Error fetching exported data (attempt {retry_count + 1}): {e}")
229
+ if retry_count < max_retries - 1:
230
+ if await self.__refresh_token():
231
+ retry_count += 1
232
+ continue
233
+ retry_count += 1
234
+
235
+ logging.error(f"Failed to fetch exported data after {max_retries} attempts.")
236
+ return False
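A hedged usage sketch for the Thron strategy as well; the organization name, credentials, and attribute names are placeholders. load() triggers the export, polls it until PROCESSED, downloads the XLSX, converts it to CSV, and parses the result into Documents.

import asyncio
from ws_bom_robot_app.llm.vector_store.integration.thron import Thron

data = {
    "organizationName": "acme",                   # placeholder <org>.thron.com subdomain
    "clientId": "xxx",
    "clientSecret": "yyy",
    "attributeFields": ["name", "description"],   # optional product attributes to export
}

docs = asyncio.run(Thron("./kb", data).load())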
ws_bom_robot_app/llm/vector_store/loader/base.py
@@ -15,6 +15,8 @@ from langchain_community.document_loaders import (
15
15
  UnstructuredImageLoader,
16
16
  UnstructuredWordDocumentLoader,
17
17
  UnstructuredXMLLoader,
18
+ UnstructuredExcelLoader,
19
+ UnstructuredPDFLoader,
18
20
  UnstructuredPowerPointLoader,
19
21
  TextLoader
20
22
  )
@@ -30,9 +32,9 @@ class Loader():
30
32
 
31
33
  _list: dict[str, LoaderConfig | None] = {
32
34
  '.json': LoaderConfig(loader=JsonLoader),
33
- '.csv': LoaderConfig(loader=CSVLoader),
35
+ '.csv': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":CSVLoader}),
34
36
  '.xls': None,
35
- '.xlsx': LoaderConfig(loader=DoclingLoader),
37
+ '.xlsx': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredExcelLoader, "strategy":"auto"}),
36
38
  '.eml': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
37
39
  '.msg': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
38
40
  '.epub': None,
@@ -47,9 +49,9 @@ class Loader():
47
49
  '.tsv': None,
48
50
  '.text': None,
49
51
  '.log': None,
50
- '.htm': LoaderConfig(loader=BSHTMLLoader),
51
- '.html': LoaderConfig(loader=BSHTMLLoader),
52
- ".pdf": LoaderConfig(loader=DoclingLoader),
52
+ '.htm': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":BSHTMLLoader}),
53
+ '.html': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":BSHTMLLoader}),
54
+ ".pdf": LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredPDFLoader, "strategy":"auto"}),
53
55
  '.png': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
54
56
  '.jpg': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
55
57
  '.jpeg': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
@@ -59,7 +61,7 @@ class Loader():
59
61
  '.tiff': None,
60
62
  '.doc': None, #see libreoffice dependency
61
63
  '.docx': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredWordDocumentLoader, "strategy":"auto"}),
62
- '.xml': LoaderConfig(loader=UnstructuredXMLLoader,kwargs={"strategy":"auto"}),
64
+ '.xml': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredXMLLoader, "strategy":"auto"}),
63
65
  '.js': None,
64
66
  '.py': None,
65
67
  '.c': None,
@@ -102,15 +104,54 @@ class Loader():
102
104
  loader_kwargs=loader_config["loader_kwargs"],
103
105
  show_progress=self._runtime_options.loader_show_progress,
104
106
  recursive=True,
105
- silent_errors=self._runtime_options.loader_silent_errors,
107
+ silent_errors=True, #self._runtime_options.loader_silent_errors,
106
108
  use_multithreading=config.robot_loader_max_threads>1,
107
- max_concurrency=config.robot_loader_max_threads
109
+ max_concurrency=config.robot_loader_max_threads,
110
+ #sample_size=200
108
111
  )
109
112
  )
110
113
  return loaders
111
114
 
112
115
  #@timer
113
116
  async def load(self) -> list[Document]:
117
+ #region log
118
+ import warnings
119
+ warnings.filterwarnings("ignore", message=".*pin_memory.*no accelerator is found.*")
120
+ warnings.filterwarnings("ignore", category=UserWarning)
121
+ log_msg_to_ignore = [
122
+ "Going to convert document batch...",
123
+ "Initializing pipeline for",
124
+ "Accelerator device:",
125
+ "detected formats:",
126
+ "The text detection result is empty",
127
+ "RapidOCR returned empty result!",
128
+ ]
129
+ class MessageFilter(logging.Filter):
130
+ def __init__(self, patterns):
131
+ super().__init__()
132
+ self.log_msg_to_ignore = patterns
133
+
134
+ def filter(self, record):
135
+ for pattern in self.log_msg_to_ignore:
136
+ if pattern in record.getMessage():
137
+ return False
138
+ return True
139
+ message_filter = MessageFilter(log_msg_to_ignore)
140
+ loggers_to_filter = [
141
+ 'docling',
142
+ 'docling.document_converter',
143
+ 'docling.datamodel',
144
+ 'docling.datamodel.document',
145
+ 'docling.models',
146
+ 'docling.models.rapidocr_model',
147
+ 'docling.utils.accelerator_utils',
148
+ 'unstructured',
149
+ 'RapidOCR'
150
+ ]
151
+ for logger_name in loggers_to_filter:
152
+ logging.getLogger(logger_name).addFilter(message_filter)
153
+ #endregion log
154
+
114
155
  MAX_RETRIES = 3
115
156
  loaders: MergedDataLoader = MergedDataLoader(self.__directory_loader())
116
157
  try:
@@ -130,5 +171,8 @@ class Loader():
130
171
  finally:
131
172
  del _documents
132
173
  finally:
174
+ # Remove logging filters
175
+ for logger_name in loggers_to_filter:
176
+ logging.getLogger(logger_name).removeFilter(message_filter)
133
177
  del loaders
134
178
  gc.collect()
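The noise suppression added to load() is a plain logging.Filter attached to, and later removed from, the docling/unstructured loggers. A standalone sketch of the same pattern, with an illustrative logger name:

import logging

class MessageFilter(logging.Filter):
    """Drop records whose message contains any of the given substrings."""
    def __init__(self, patterns):
        super().__init__()
        self.patterns = patterns
    def filter(self, record):
        return not any(p in record.getMessage() for p in self.patterns)

logging.basicConfig(level=logging.INFO)
noisy = logging.getLogger("docling.models")
noise_filter = MessageFilter(["Accelerator device:"])
noisy.addFilter(noise_filter)
noisy.info("Accelerator device: cpu")   # suppressed by the filter
noisy.info("converted 3 pages")         # passes through
noisy.removeFilter(noise_filter)        # restored afterwards, as load() does in its finally block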
ws_bom_robot_app/llm/vector_store/loader/docling.py
@@ -4,23 +4,52 @@ from langchain_core.document_loaders import BaseLoader
4
4
  from langchain_core.documents import Document
5
5
  from langchain_core.runnables import run_in_executor
6
6
  from docling.document_converter import DocumentConverter, InputFormat, PdfFormatOption, ImageFormatOption
7
- from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode, TesseractCliOcrOptions
7
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
8
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode
8
9
  from langchain_community.document_loaders import UnstructuredFileLoader
10
+ from ws_bom_robot_app.llm.vector_store.db.base import VectorDBStrategy
11
+ from docling.datamodel.pipeline_options import RapidOcrOptions
13
+
14
+ def _doclingConverter() -> DocumentConverter:
15
+ _pipeline_config = {
16
+ "accelerator_options": AcceleratorOptions(
17
+ device=AcceleratorDevice.AUTO,
18
+ cuda_use_flash_attention2=False,
19
+ ),
20
+ "table_structure_options": TableStructureOptions(mode=TableFormerMode.ACCURATE),
21
+ }
22
+ _base_pipeline_options = PdfPipelineOptions(
23
+ **_pipeline_config,
24
+ do_ocr=False)
25
+ _ocr_pipeline_options = PdfPipelineOptions(
26
+ **_pipeline_config,
27
+ ocr_options=RapidOcrOptions(
28
+ print_verbose=False,
29
+ text_score=0.5,
30
+ #rapidocr_params={"det_use_cuda": True}
31
+ ))
32
+ doc_converter = DocumentConverter(
33
+ format_options={
34
+ InputFormat.PDF: PdfFormatOption(
35
+ pipeline_options=_base_pipeline_options,
36
+ ),
37
+ InputFormat.IMAGE: ImageFormatOption(
38
+ pipeline_options=_ocr_pipeline_options,
39
+ ),
40
+ }
41
+ )
42
+ for frm in [InputFormat.PDF, InputFormat.IMAGE]:
43
+ doc_converter.initialize_pipeline(frm)
44
+ return doc_converter
9
45
 
10
46
  class DoclingLoader(BaseLoader):
47
+ _doc_converter: Optional[DocumentConverter] = None
11
48
  def __init__(self, file_path: str | list[str], **kwargs: Any) -> None:
12
49
  self._file_paths = file_path if isinstance(file_path, list) else [file_path]
13
- self._converter = DocumentConverter(format_options={
14
- InputFormat.PDF: PdfFormatOption(
15
- pipeline_options=PdfPipelineOptions(
16
- table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE)
17
- )),
18
- InputFormat.IMAGE: ImageFormatOption(
19
- pipeline_options=PdfPipelineOptions(
20
- ocr_options=TesseractCliOcrOptions(lang=["auto"]),
21
- table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE)
22
- ))
23
- })
50
+ if DoclingLoader._doc_converter is None:
51
+ DoclingLoader._doc_converter = _doclingConverter()
52
+ self._converter = DoclingLoader._doc_converter
24
53
  self._kwargs = kwargs
25
54
  def load(self) -> list[Document]:
26
55
  """Load data into Document objects."""
@@ -37,28 +66,37 @@ class DoclingLoader(BaseLoader):
37
66
  if doc is done:
38
67
  break
39
68
  yield doc # type: ignore[misc]
69
+ def _fallback_loader(self, source: str, error: Exception = None) -> Iterator[Document]:
70
+ if 'fallback' in self._kwargs:
71
+ if issubclass(self._kwargs['fallback'], (BaseLoader, UnstructuredFileLoader)):
72
+ logging.info(f"Using fallback loader {self._kwargs['fallback']} for {source}")
73
+ try:
74
+ loader: Union[BaseLoader, UnstructuredFileLoader] = self._kwargs['fallback'](
75
+ source,
76
+ **{k: v for k, v in self._kwargs.items() if k != 'fallback'}
77
+ )
78
+ yield from loader.lazy_load()
79
+ except Exception as e:
80
+ logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
81
+ else:
82
+ logging.warning(f"Invalid fallback loader {self._kwargs['fallback']}[{type(self._kwargs['fallback'])}] for {source}")
83
+ else:
84
+ logging.warning(f"Failed to load document from {source}: {error}")
40
85
  def lazy_load(self) -> Iterator[Document]:
41
86
  for source in self._file_paths:
42
87
  try:
43
- _result = self._converter.convert(
44
- os.path.abspath(source),
45
- raises_on_error=True)
46
- doc = _result.document
47
- text = doc.export_to_markdown(image_placeholder="")
48
- yield Document(page_content=text, metadata={"source": source})
49
- except Exception as e:
50
- if 'fallback' in self._kwargs:
51
- if issubclass(self._kwargs['fallback'], (BaseLoader, UnstructuredFileLoader)):
52
- logging.info(f"Using fallback loader {self._kwargs['fallback']} for {source}")
53
- try:
54
- loader: Union[BaseLoader, UnstructuredFileLoader] = self._kwargs['fallback'](
55
- source,
56
- **{k: v for k, v in self._kwargs.items() if k != 'fallback'}
57
- )
58
- yield from loader.lazy_load()
59
- except Exception as e:
60
- logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
61
- else:
62
- logging.warning(f"Invalid fallback loader {self._kwargs['fallback']}[{type(self._kwargs['fallback'])}] for {source}")
88
+ # handle only small header-based files (.csv/.xlsx) with Docling; route larger ones to the fallback loader to avoid header stripping and improper chunking
89
+ if (source.endswith('.csv') or source.endswith('.xlsx')) \
90
+ and 'fallback' in self._kwargs \
91
+ and os.path.getsize(source) > (VectorDBStrategy.MAX_TOKENS_PER_BATCH // 4): #rough token estimate
92
+ yield from self._fallback_loader(source)
63
93
  else:
64
- logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
94
+ _result = self._converter.convert(
95
+ os.path.abspath(source),
96
+ raises_on_error=True)
97
+ doc = _result.document
98
+ text = doc.export_to_markdown(image_placeholder="")
99
+ yield Document(page_content=text, metadata={"source": source})
100
+ except Exception as e:
101
+ yield from self._fallback_loader(source,e)
102
+
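To illustrate the reworked flow: the shared DocumentConverter is now built once per process and reused by every DoclingLoader instance, oversized .csv/.xlsx files are routed straight to the fallback loader, and any conversion error also falls back. A small sketch with a placeholder file path, mirroring the fallback wiring used in loader/base.py:

from langchain_community.document_loaders import UnstructuredPDFLoader
from ws_bom_robot_app.llm.vector_store.loader.docling import DoclingLoader

loader = DoclingLoader(
    "./kb/report.pdf",               # placeholder path
    fallback=UnstructuredPDFLoader,  # used if Docling conversion raises
    strategy="auto",                 # remaining kwargs are forwarded to the fallback loader
)
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))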