ws-bom-robot-app 0.0.80__py3-none-any.whl → 0.0.82__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. ws_bom_robot_app/config.py +10 -0
  2. ws_bom_robot_app/cron_manager.py +6 -6
  3. ws_bom_robot_app/llm/api.py +2 -2
  4. ws_bom_robot_app/llm/providers/llm_manager.py +5 -6
  5. ws_bom_robot_app/llm/utils/cleanup.py +7 -0
  6. ws_bom_robot_app/llm/utils/download.py +0 -2
  7. ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
  8. ws_bom_robot_app/llm/vector_store/integration/base.py +57 -15
  9. ws_bom_robot_app/llm/vector_store/integration/confluence.py +1 -1
  10. ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  11. ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
  12. ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
  13. ws_bom_robot_app/llm/vector_store/integration/googledrive.py +1 -1
  14. ws_bom_robot_app/llm/vector_store/integration/jira.py +93 -60
  15. ws_bom_robot_app/llm/vector_store/integration/manager.py +2 -0
  16. ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
  17. ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
  18. ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
  19. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
  20. ws_bom_robot_app/llm/vector_store/integration/sitemap.py +3 -0
  21. ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
  22. ws_bom_robot_app/llm/vector_store/integration/thron.py +2 -3
  23. ws_bom_robot_app/llm/vector_store/loader/base.py +8 -6
  24. ws_bom_robot_app/llm/vector_store/loader/docling.py +1 -1
  25. ws_bom_robot_app/subprocess_runner.py +103 -0
  26. ws_bom_robot_app/task_manager.py +169 -41
  27. {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/METADATA +18 -8
  28. {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/RECORD +30 -28
  29. {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/WHEEL +0 -0
  30. {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/integration/sharepoint.py

@@ -1,7 +1,7 @@
  import asyncio, logging, traceback
  from dataclasses import dataclass
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointIndexer, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
+ from unstructured_ingest.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointIndexer, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional

@@ -14,22 +14,18 @@ class SharepointParams(BaseModel):
  Attributes:
  client_id (str): The client ID for SharePoint authentication.
  client_secret (str): The client secret for SharePoint authentication.
+ tenant_id (str, optional): The tenant ID for SharePoint authentication. Defaults to None.
  site_url (str): The URL of the SharePoint site. i.e. site collection level: https://<tenant>.sharepoint.com/sites/<site-collection-name>, or root site: https://<tenant>.sharepoint.com
  site_path (str, optional): TThe path in the SharePoint site from which to start parsing files, for example "Shared Documents". Defaults to None.
  recursive (bool, optional): Whether to recursively access subdirectories. Defaults to False.
- omit_files (bool, optional): Whether to omit files from the results. Defaults to False.
- omit_pages (bool, optional): Whether to omit pages from the results. Defaults to False.
- omit_lists (bool, optional): Whether to omit lists from the results. Defaults to False.
  extension (list[str], optional): A list of file extensions to include, i.e. [".pdf"] Defaults to None.
  """
  client_id : str = Field(validation_alias=AliasChoices("clientId","client_id"))
  client_secret : str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
  site_url: str = Field(validation_alias=AliasChoices("siteUrl","site_url"))
  site_path: str = Field(default=None,validation_alias=AliasChoices("sitePath","site_path"))
+ tenant_id: str = Field(default=None, validation_alias=AliasChoices("tenantId","tenant_id"))
  recursive: bool = Field(default=False)
- omit_files: bool = Field(default=False, validation_alias=AliasChoices("omitFiles","omit_files")),
- omit_pages: bool = Field(default=False, validation_alias=AliasChoices("omitPages","omit_pages")),
- omit_lists: bool = Field(default=False, validation_alias=AliasChoices("omitLists","omit_lists")),
  extension: list[str] = Field(default=None)
  class Sharepoint(IntegrationStrategy):
  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):

@@ -41,10 +37,7 @@ class Sharepoint(IntegrationStrategy):
  def run(self) -> None:
  indexer_config = SharepointIndexerConfig(
  path=self.__data.site_path,
- recursive=self.__data.recursive,
- omit_files=self.__data.omit_files,
- omit_pages=self.__data.omit_pages,
- omit_lists=self.__data.omit_lists
+ recursive=self.__data.recursive
  )
  downloader_config = SharepointDownloaderConfig(
  download_dir=self.working_directory

@@ -53,15 +46,15 @@ class Sharepoint(IntegrationStrategy):
  access_config=SharepointAccessConfig(client_cred=self.__data.client_secret),
  client_id=self.__data.client_id,
  site=self.__data.site_url,
- permissions_config=None
+ tenant= self.__data.tenant_id if self.__data.tenant_id else None
  )
  pipeline = self.__unstructured_ingest.pipeline(
  indexer_config,
  downloader_config,
  connection_config,
  extension=self.__data.extension)
- current_indexer_process = pipeline.indexer_step.process
- pipeline.indexer_step.process = CustomSharepointIndexer(**vars(current_indexer_process))
+ #current_indexer_process = pipeline.indexer_step.process
+ #pipeline.indexer_step.process = CustomSharepointIndexer(**vars(current_indexer_process))
  pipeline.run()
  async def load(self) -> list[Document]:
  await asyncio.to_thread(self.run)
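The SharePoint change above adds an optional tenant_id (accepted as tenantId or tenant_id) and drops the omit_* flags. A minimal sketch of how the reworked params might be populated, assuming the module path shown in this diff; all values below are hypothetical placeholders:

    from ws_bom_robot_app.llm.vector_store.integration.sharepoint import SharepointParams

    # Hypothetical camelCase payload: AliasChoices lets either naming style validate.
    params = SharepointParams.model_validate({
        "clientId": "00000000-0000-0000-0000-000000000000",
        "clientSecret": "<secret>",
        "tenantId": "contoso.onmicrosoft.com",
        "siteUrl": "https://contoso.sharepoint.com/sites/docs",
        "sitePath": "Shared Documents",
        "recursive": True,
        "extension": [".pdf", ".docx"],
    })
    assert params.tenant_id == "contoso.onmicrosoft.com"  # feeds SharepointConnectionConfig(tenant=...)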
ws_bom_robot_app/llm/vector_store/integration/shopify.py (new file)

@@ -0,0 +1,143 @@
+ import asyncio, logging, aiohttp
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
+ from langchain_core.documents import Document
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+ from typing import List, Union, Optional
+ from pydantic import BaseModel, Field, AliasChoices, field_validator
+ import json
+ import os
+
+ class ShopifyParams(BaseModel):
+ """
+ ShopifyParams is a model that defines the parameters required for Shopify integration.
+
+ Attributes:
+ shop_name (str): The shop name for Shopify.
+ access_token (str): The access token for Shopify.
+ graphql_query (Union[str, dict]): The GraphQL query string or dict for Shopify.
+ """
+ shop_name: str = Field(validation_alias=AliasChoices("shopName","shop_name"))
+ access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
+ graphql_query: Union[str, dict] = Field(validation_alias=AliasChoices("graphqlQuery","graphql_query"))
+
+ @field_validator('graphql_query')
+ @classmethod
+ def extract_query_string(cls, v):
+ """Extract the query string from dict format if needed"""
+ if isinstance(v, dict) and 'query' in v:
+ return v['query']
+ return v
+
+ class Shopify(IntegrationStrategy):
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+ super().__init__(knowledgebase_path, data)
+ self.__data = ShopifyParams.model_validate(self.data)
+
+ def working_subdirectory(self) -> str:
+ return 'shopify'
+
+ async def run(self) -> None:
+ _data = await self.__get_data()
+ json_file_path = os.path.join(self.working_directory, 'shopify_data.json')
+ with open(json_file_path, 'w', encoding='utf-8') as f:
+ json.dump(_data, f, ensure_ascii=False)
+
+ async def load(self) -> list[Document]:
+ await self.run()
+ await asyncio.sleep(1)
+ return await Loader(self.working_directory).load()
+
+ async def __get_data(self, page_size: int = 50) -> List[dict]:
+ # API URL
+ url = f"https://{self.__data.shop_name}.myshopify.com/admin/api/2024-07/graphql.json"
+
+ # Headers
+ headers = {
+ "X-Shopify-Access-Token": self.__data.access_token,
+ "Content-Type": "application/json"
+ }
+
+ all_products: List[dict] = []
+ has_next_page = True
+ cursor = None
+ retry_count = 0
+ max_retries = 5
+
+ while has_next_page:
+ # Variables for the query
+ variables = {
+ "first": page_size
+ }
+
+ if cursor:
+ variables["after"] = cursor
+
+ # Request payload
+ payload = {
+ "query": self.__data.graphql_query,
+ "variables": variables
+ }
+
+ try:
+ # Perform the request
+ async with aiohttp.ClientSession() as session:
+ async with session.post(url, headers=headers, json=payload) as response:
+ # Check whether the response is JSON
+ try:
+ data = await response.json()
+ except aiohttp.ContentTypeError:
+ text = await response.text()
+ logging.error(f"Non-JSON response received. Status code: {response.status}")
+ logging.error(f"Content: {text}")
+ raise Exception("Invalid response from API")
+
+ # Throttling handling
+ if "errors" in data:
+ error = data["errors"][0]
+ if error.get("extensions", {}).get("code") == "THROTTLED":
+ retry_count += 1
+ if retry_count > max_retries:
+ raise Exception("Too many throttling attempts. Stopping execution.")
+
+ # Wait a bit longer on each attempt
+ wait_time = 2 ** retry_count # Exponential backoff
+ print(f"Rate limit reached. Waiting {wait_time} seconds... (Attempt {retry_count}/{max_retries})")
+ await asyncio.sleep(wait_time)
+ continue
+ else:
+ raise Exception(f"GraphQL errors: {data['errors']}")
+
+ # Reset the retry counter if the request succeeded
+ retry_count = 0
+
+ # Extract the data
+ products_data = data["data"]["products"]
+ edges = products_data["edges"]
+ page_info = products_data["pageInfo"]
+
+ # Add the products to the list
+ for edge in edges:
+ all_products.append(edge["node"])
+
+ # Update the cursor and the pagination flag
+ has_next_page = page_info["hasNextPage"]
+ cursor = page_info["endCursor"]
+
+ print(f"Retrieved {len(edges)} products. Total: {len(all_products)}")
+
+ # Small pause to avoid saturating the API
+ await asyncio.sleep(0.1)
+
+ except aiohttp.ClientError as e:
+ logging.error(f"Connection error: {e}")
+ retry_count += 1
+ if retry_count <= max_retries:
+ wait_time = 2 ** retry_count
+ logging.warning(f"Retrying in {wait_time} seconds...")
+ await asyncio.sleep(wait_time)
+ continue
+ else:
+ raise Exception("Too many network errors. Stopping execution.")
+
+ logging.info(f"Data retrieval completed! Total products: {len(all_products)}")
+ return all_products
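A minimal usage sketch of the new Shopify integration, assuming the module path added in this diff. The query, shop name, token and knowledgebase path are hypothetical; the only real constraint, visible in __get_data above, is that the query must accept $first/$after variables and return products.edges[].node plus pageInfo.hasNextPage/endCursor:

    import asyncio
    from ws_bom_robot_app.llm.vector_store.integration.shopify import Shopify

    QUERY = """
    query Products($first: Int!, $after: String) {
      products(first: $first, after: $after) {
        edges { node { id title descriptionHtml } }
        pageInfo { hasNextPage endCursor }
      }
    }
    """

    async def main():
        shopify = Shopify("/tmp/kb", {            # "/tmp/kb" is a placeholder knowledgebase path
            "shopName": "my-shop",                # hypothetical shop
            "accessToken": "shpat_xxx",           # hypothetical token
            "graphqlQuery": {"query": QUERY},     # dict form; the field_validator extracts the string
        })
        docs = await shopify.load()               # paginates the API, writes shopify_data.json, loads Documents
        print(len(docs))

    asyncio.run(main())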
ws_bom_robot_app/llm/vector_store/integration/sitemap.py

@@ -1,3 +1,4 @@
+ import sys, asyncio
  from typing import Any, AsyncGenerator, AsyncIterator
  import aiofiles
  import aiofiles.os

@@ -64,6 +65,8 @@ class Sitemap(IntegrationStrategy):
  return f"{self.knowledgebase_path}/{url}" if self._is_local(url) else url
  async def alazy_load(self,loader: SitemapLoader) -> AsyncIterator[Document]:
  """A lazy loader for Documents."""
+ if sys.platform == 'win32':
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
  iterator = await run_in_executor(None, loader.lazy_load)
  done = object()
  while True:
ws_bom_robot_app/llm/vector_store/integration/slack.py

@@ -1,6 +1,7 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
+ from unstructured_ingest.interfaces.downloader import DownloaderConfig
+ from unstructured_ingest.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union

@@ -39,7 +40,7 @@ class Slack(IntegrationStrategy):
  start_date=datetime.now() - timedelta(days=self.__data.num_days),
  end_date=datetime.now()
  )
- downloader_config = SlackDownloaderConfig(
+ downloader_config = DownloaderConfig(
  download_dir=self.working_directory
  )
  connection_config = SlackConnectionConfig(
ws_bom_robot_app/llm/vector_store/integration/thron.py

@@ -1,6 +1,5 @@
  import asyncio, logging, aiohttp
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import List, Union, Optional

@@ -54,7 +53,7 @@ class Thron(IntegrationStrategy):
  "accept": "application/json",
  "Content-Type": "application/x-www-form-urlencoded"
  }
- async with session.post("https://websolute.thron.com/api/v1/authentication/oauth2/token", data=auth_data, headers=headers) as response:
+ async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/authentication/oauth2/token", data=auth_data, headers=headers) as response:
  result = await response.json()
  return result.get("access_token", "")
  except Exception as e:
ws_bom_robot_app/llm/vector_store/loader/base.py

@@ -15,6 +15,8 @@ from langchain_community.document_loaders import (
  UnstructuredImageLoader,
  UnstructuredWordDocumentLoader,
  UnstructuredXMLLoader,
+ UnstructuredExcelLoader,
+ UnstructuredPDFLoader,
  UnstructuredPowerPointLoader,
  TextLoader
  )

@@ -30,9 +32,9 @@ class Loader():

  _list: dict[str, LoaderConfig | None] = {
  '.json': LoaderConfig(loader=JsonLoader),
- '.csv': LoaderConfig(loader=CSVLoader),
+ '.csv': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":CSVLoader}),
  '.xls': None,
- '.xlsx': LoaderConfig(loader=DoclingLoader),
+ '.xlsx': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredExcelLoader, "strategy":"auto"}),
  '.eml': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
  '.msg': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
  '.epub': None,

@@ -47,9 +49,9 @@ class Loader():
  '.tsv': None,
  '.text': None,
  '.log': None,
- '.htm': LoaderConfig(loader=BSHTMLLoader),
- '.html': LoaderConfig(loader=BSHTMLLoader),
- ".pdf": LoaderConfig(loader=DoclingLoader),
+ '.htm': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":BSHTMLLoader}),
+ '.html': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":BSHTMLLoader}),
+ ".pdf": LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredPDFLoader, "strategy":"auto"}),
  '.png': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
  '.jpg': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
  '.jpeg': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),

@@ -59,7 +61,7 @@ class Loader():
  '.tiff': None,
  '.doc': None, #see liberoffice dependency
  '.docx': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredWordDocumentLoader, "strategy":"auto"}),
- '.xml': LoaderConfig(loader=UnstructuredXMLLoader,kwargs={"strategy":"auto"}),
+ '.xml': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredXMLLoader, "strategy":"auto"}),
  '.js': None,
  '.py': None,
  '.c': None,
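These loader hunks route more extensions through DoclingLoader while passing a "fallback" loader class in kwargs. How that kwarg is consumed is not shown in this diff; the sketch below only illustrates the likely pattern (try Docling first, fall back to the named langchain loader), and the helper name and signatures are hypothetical:

    # Illustrative only; the real DoclingLoader in this package may differ.
    from langchain_core.documents import Document

    def load_with_fallback(file_path: str, primary_cls, fallback=None, **kwargs) -> list[Document]:
        try:
            return primary_cls(file_path).load()          # e.g. DoclingLoader
        except Exception:
            if fallback is None:
                raise
            # remaining kwargs (e.g. strategy="auto") are forwarded to the fallback loader
            return fallback(file_path, **kwargs).load()   # e.g. UnstructuredPDFLoader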
ws_bom_robot_app/llm/vector_store/loader/docling.py

@@ -17,7 +17,7 @@ class DoclingLoader(BaseLoader):
  )),
  InputFormat.IMAGE: ImageFormatOption(
  pipeline_options=PdfPipelineOptions(
- ocr_options=TesseractCliOcrOptions(lang=["auto"]),
+ #ocr_options=TesseractCliOcrOptions(lang=["auto"]), #default to easyOcr
  table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE)
  ))
  })
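The change above comments out the Tesseract CLI OCR options so docling falls back to its default OCR engine (EasyOCR), as the inline comment notes. A minimal standalone sketch of an equivalent converter configuration, assuming docling's standard DocumentConverter API and the option classes already imported by this file:

    from docling.document_converter import DocumentConverter, ImageFormatOption
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode

    converter = DocumentConverter(format_options={
        InputFormat.IMAGE: ImageFormatOption(
            pipeline_options=PdfPipelineOptions(
                # no ocr_options set -> docling uses its default OCR engine (EasyOCR)
                table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE),
            )
        )
    })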
ws_bom_robot_app/subprocess_runner.py (new file)

@@ -0,0 +1,103 @@
+ import logging
+ import multiprocessing as mp
+ from multiprocessing.connection import Connection
+ import dill as _pickler
+ import types, traceback
+ import asyncio, sys
+ from ws_bom_robot_app.config import config
+
+ def _worker_run_pickled(serialized_task: bytes, conn: Connection):
+ """
+ Unpickle the object (should be an awaitable or callable), run it inside its own asyncio loop,
+ capture return value or exception and send back via conn.send((ok_flag, payload_serialized)).
+ This runs in a separate process and must be top-level for multiprocessing.
+ """
+ try:
+ if _pickler is None:
+ raise RuntimeError("No pickler available in worker process.")
+
+ obj = _pickler.loads(serialized_task)
+
+ # If obj is a coroutine object, run directly; if it's a callable, call it and maybe await result.
+ async def _wrap_and_run(o):
+ if asyncio.iscoroutine(o):
+ return await o
+ elif isinstance(o, types.FunctionType) or callable(o):
+ # call it; if returns coroutine, await it
+ result = o()
+ if asyncio.iscoroutine(result):
+ return await result
+ return result
+ else:
+ # not callable / awaitable
+ return o
+
+ # Run inside asyncio.run (fresh loop)
+ result = asyncio.run(_wrap_and_run(obj))
+ # try to pickle result for sending, if fails, str() it
+ try:
+ payload = _pickler.dumps(("ok", result))
+ except Exception:
+ payload = _pickler.dumps(("ok", str(result)))
+ conn.send_bytes(payload)
+ except Exception as e:
+ # send back the error details
+ try:
+ tb = traceback.format_exc()
+ payload = _pickler.dumps(("err", {"error": str(e), "traceback": tb}))
+ conn.send_bytes(payload)
+ except Exception:
+ # last resort: send plain text
+ try:
+ conn.send_bytes(b'ERR:' + str(e).encode("utf-8"))
+ except Exception:
+ pass
+ finally:
+ try:
+ conn.close()
+ except Exception:
+ pass
+ async def _recv_from_connection_async(conn: Connection):
+ """
+ Blocking recv wrapped for asyncio using a threadpool.
+ We expect worker to use conn.send_bytes(payload) — we use conn.recv_bytes() to get bytes.
+ """
+ loop = asyncio.get_event_loop()
+ return await loop.run_in_executor(None, conn.recv_bytes) # blocking call inside executor
+ def _start_subprocess_for_coroutine(coroutine_obj):
+ """
+ Try to start a subprocess that will run the provided coroutine/callable.
+ Returns tuple (process, parent_conn, used_subprocess_flag)
+ If cannot serialize, returns (None, None, False)
+ """
+ def _get_mp_start_method():
+ """Get the multiprocessing start method.
+
+ For Windows + Jupyter compatibility, 'spawn' is required
+ 'spawn' guarantees that every worker starts fresh and doesn't carry Python heap or native allocations from the parent.
+ 'fork' to get faster startup and lower initial memory cost, carries over everything in parent memory, including global variables and open resources: can be unsafe with threads, async loops
+
+ Returns:
+ str: The multiprocessing start method.
+ """
+ if sys.platform == "win32":
+ return "spawn"
+ return config.robot_task_mp_method
+
+ try:
+ serialized = _pickler.dumps(coroutine_obj)
+ except Exception:
+ # cannot serialize the coroutine/callable -> fall back to in-process
+ return (None, None, False)
+
+ parent_conn, child_conn = mp.Pipe(duplex=False)
+
+ ctx = mp.get_context(_get_mp_start_method())
+ p = ctx.Process(target=_worker_run_pickled, args=(serialized, child_conn), daemon=False)
+ p.start()
+ # close child conn in parent process
+ try:
+ child_conn.close()
+ except Exception:
+ pass
+ return (p, parent_conn, True)
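The new subprocess_runner module serializes a coroutine or callable with dill, replays it under asyncio.run in a child process, and ships back a ("ok", result) or ("err", {...}) tuple over a pipe. A minimal sketch of how the parent side might drive these helpers; heavy_job and the result handling are illustrative, not taken from task_manager.py:

    import asyncio
    import dill
    from ws_bom_robot_app.subprocess_runner import (
        _start_subprocess_for_coroutine,
        _recv_from_connection_async,
    )

    async def heavy_job() -> int:
        # placeholder workload
        await asyncio.sleep(0.1)
        return 42

    async def main():
        # pass the async function itself; the worker calls it and awaits the resulting coroutine
        proc, conn, used_subprocess = _start_subprocess_for_coroutine(heavy_job)
        if not used_subprocess:
            print(await heavy_job())            # serialization failed: run in-process instead
            return
        payload = await _recv_from_connection_async(conn)
        status, result = dill.loads(payload)    # ("ok", value) or ("err", {"error", "traceback"})
        proc.join()
        print(status, result)

    if __name__ == "__main__":                  # required for the 'spawn' start method
        asyncio.run(main())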