PyPI - ws-bom-robot-app - Versions diffs - 0.0.21__tar.gz → 0.0.23__tar.gz - Mend

ws-bom-robot-app 0.0.21.tar.gz → 0.0.23.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ws_bom_robot_app
-Version: 0.0.21
+Version: 0.0.23
 Summary: A FastAPI application serving ws bom/robot/llm platform ai.
 Home-page: https://github.com/websolutespa/bom
 Author: Websolute Spa
@@ -23,12 +23,19 @@ Requires-Dist: langchain-core==0.3.21
 Requires-Dist: faiss-cpu==1.9.0
 Requires-Dist: python-magic==0.4.27
 Requires-Dist: opencv-python-headless==4.10.0.84
-Requires-Dist: unstructured[all-docs]==0.15.14
+Requires-Dist: unstructured[all-docs]==0.16.11
 Requires-Dist: langchain_unstructured==0.1.5
 Requires-Dist: unstructured-ingest==0.3.8
+Requires-Dist: unstructured-ingest[azure]
 Requires-Dist: unstructured-ingest[confluence]
+Requires-Dist: unstructured-ingest[dropbox]
+Requires-Dist: unstructured-ingest[gcs]
+Requires-Dist: unstructured-ingest[google_drive]
 Requires-Dist: unstructured-ingest[github]
 Requires-Dist: unstructured-ingest[jira]
+Requires-Dist: unstructured-ingest[s3]
+Requires-Dist: unstructured-ingest[slack]
+Requires-Dist: unstructured-ingest[sftp]
 Requires-Dist: html5lib==1.1
 Requires-Dist: markdownify==0.14.1
 Requires-Dist: nebuly==0.3.33
@@ -207,6 +214,13 @@ launch debugger
 streamlit run debugger.py --server.port 6002
 ```
+dockerize app from src
+```pwsh
+docker build -f Dockerfile-src -t ws-bom-robot-app:src .
+docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
+```
 ### ✈️ publish
 - [testpypi](https://test.pypi.org/project/ws-bom-robot-app/)

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/README.md RENAMED Viewed

@@ -172,6 +172,13 @@ launch debugger
 streamlit run debugger.py --server.port 6002
 ```
+dockerize app from src
+```pwsh
+docker build -f Dockerfile-src -t ws-bom-robot-app:src .
+docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
+```
 ### ✈️ publish
 - [testpypi](https://test.pypi.org/project/ws-bom-robot-app/)

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="ws_bom_robot_app",
-    version="0.0.21",
+    version="0.0.23",
     description="A FastAPI application serving ws bom/robot/llm platform ai.",
     long_description=open("README.md", encoding='utf-8').read(),
     long_description_content_type="text/markdown",

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/azure.py ADDED Viewed

@@ -0,0 +1,62 @@
+import asyncio
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.v2.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import Union, Optional
+from pydantic import BaseModel, Field, AliasChoices
+class AzureParams(BaseModel):
+  """
+  AzureParams is a model that holds configuration parameters for connecting to Azure services.
+  Attributes:
+    remote_url (str): The URL of the remote Azure service, in the form az://<container> or az://<container>/<path> for sub-folders.
+    account_name (str): The name of the Azure storage account.
+    \nProvide one of the following:
+      - account_key (Optional[str]): The key for the Azure storage account. Default is None.
+      - connection_string (Optional[str]): The connection string for the Azure storage account. Default is None.
+      - sas_token (Optional[str]): The Shared Access Signature token for the Azure storage account. Default is None. Detail: https://learn.microsoft.com/en-us/azure/ai-services/translator/document-translation/how-to-guides/create-sas-tokens?tabs=Containers
+    recursive (bool): Indicates whether the operation should be recursive. Default is False.
+    extension (list[str]): A list of file extensions to filter the files. Default is None.
+  """
+  remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
+  account_name: str = Field(validation_alias=AliasChoices("accountName","account_name"))
+  account_key: Optional[str] = Field(default=None,validation_alias=AliasChoices("accountKey","account_key"))
+  connection_string: Optional[str]  = Field(default=None,validation_alias=AliasChoices("connectionString","connection_string"))
+  sas_token: Optional[str]  = Field(default=None,validation_alias=AliasChoices("sasToken","sas_token"))
+  recursive: bool = False
+  extension: list[str] = Field(default=None)
+class Azure(IntegrationStrategy):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    super().__init__(knowledgebase_path, data)
+    self.__data = AzureParams.model_validate(self.data)
+    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+  def working_subdirectory(self) -> str:
+    return 'azure'
+  def run(self) -> None:
+    indexer_config = AzureIndexerConfig(
+      remote_url=self.__data.remote_url,
+      recursive=self.__data.recursive,
+      #sample_n_files=1
+    )
+    downloader_config = AzureDownloaderConfig(
+      download_dir=self.working_directory
+    )
+    connection_config = AzureConnectionConfig(
+      access_config=AzureAccessConfig(
+        account_name=self.__data.account_name,
+        account_key=self.__data.account_key,
+        connection_string=self.__data.connection_string,
+        sas_token=self.__data.sas_token
+        )
+    )
+    self.__unstructured_ingest.pipeline(
+      indexer_config,
+      downloader_config,
+      connection_config,
+      extension=self.__data.extension).run()
+  async def load(self) -> list[Document]:
+      await asyncio.to_thread(self.run)
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/base.py ADDED Viewed

@@ -0,0 +1,43 @@
+import os
+from langchain_core.documents import Document
+from abc import ABC, abstractmethod
+from unstructured_ingest.v2.interfaces import ProcessorConfig
+from unstructured_ingest.v2.pipeline.pipeline import Pipeline, PartitionerConfig, FiltererConfig
+from typing import Union
+class IntegrationStrategy(ABC):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    self.knowledgebase_path = knowledgebase_path
+    self.data = data
+    self.working_directory = os.path.join(self.knowledgebase_path,self.working_subdirectory())
+    os.makedirs(self.working_directory, exist_ok=True)
+  @property
+  @abstractmethod
+  def working_subdirectory(self) -> str:
+    pass
+  @abstractmethod
+  #@timer
+  def load(self) -> list[Document]:
+    pass
+class UnstructuredIngest():
+  def __init__(self, working_directory: str):
+    self.working_directory = working_directory
+  def pipeline(self,indexer,downloader,connection,extension: list[str] = None) -> Pipeline:
+    return Pipeline.from_configs(
+      context=ProcessorConfig(
+        reprocess=False,
+        verbose=False,
+        tqdm=False,
+        num_processes=2,
+        preserve_downloads=True,
+        download_only=True,
+        raise_on_error=False
+      ),
+      indexer_config=indexer,
+      downloader_config=downloader,
+      source_connection_config=connection,
+      partitioner_config=PartitionerConfig(),
+      filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
+    )

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/confluence.py ADDED Viewed

@@ -0,0 +1,53 @@
+import asyncio
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import Union
+from pydantic import BaseModel, Field, AliasChoices
+class ConfluenceParams(BaseModel):
+  """
+  ConfluenceParams is a data model for storing Confluence integration parameters.
+  Attributes:
+    url (str): The URL of the Confluence instance, e.g., 'https://example.atlassian.net'.
+    access_token (str): The access token for authenticating with Confluence, e.g., 'AT....'
+    user_email (str): The email address of the Confluence user
+    spaces (list[str]): A list of Confluence spaces to interact with, e.g., ['SPACE1', 'SPACE2'].
+    extension (list[str], optional): A list of file extensions to filter by. Defaults to None, e.g., ['.pdf', '.docx'].
+  """
+  url: str
+  access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
+  user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"))
+  spaces: list[str] = []
+  extension: list[str] = Field(default=None)
+class Confluence(IntegrationStrategy):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    super().__init__(knowledgebase_path, data)
+    self.__data = ConfluenceParams.model_validate(self.data)
+    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+  def working_subdirectory(self) -> str:
+    return 'confluence'
+  def run(self) -> None:
+    indexer_config = ConfluenceIndexerConfig(
+      spaces=self.__data.spaces
+    )
+    downloader_config = ConfluenceDownloaderConfig(
+      download_dir=self.working_directory
+    )
+    connection_config = ConfluenceConnectionConfig(
+      access_config=ConfluenceAccessConfig(api_token=self.__data.access_token),
+      url=self.__data.url,
+      user_email=self.__data.user_email
+    )
+    self.__unstructured_ingest.pipeline(
+      indexer_config,
+      downloader_config,
+      connection_config,
+      extension=self.__data.extension).run()
+  async def load(self) -> list[Document]:
+      await asyncio.to_thread(self.run)
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/dropbox.py ADDED Viewed

@@ -0,0 +1,53 @@
+import asyncio
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import Union
+from pydantic import BaseModel, Field, AliasChoices
+class DropboxParams(BaseModel):
+  """
+  DropboxParams is a model for storing parameters required to interact with Dropbox.
+  Attributes:
+    remote_url (str): The URL of the remote Dropbox location, e.g. 'dropbox://demo-directory' or 'dropbox://demo-directory/sub-directory'.
+    token (str): The authentication token for accessing Dropbox.
+      create app: https://www.dropbox.com/developers, with file.content.read permission, and generate token.
+    recursive (bool, optional): A flag indicating whether to search directories recursively. Defaults to False.
+    extension (list[str], optional): A list of file extensions to filter by. Defaults to None, e.g. ['.pdf', '.docx'].
+  """
+  remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
+  token: str
+  recursive: bool = False
+  extension: list[str] = Field(default=None)
+class Dropbox(IntegrationStrategy):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    super().__init__(knowledgebase_path, data)
+    self.__data = DropboxParams.model_validate(self.data)
+    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+  def working_subdirectory(self) -> str:
+    return 'dropbox'
+  def run(self) -> None:
+    indexer_config = DropboxIndexerConfig(
+      remote_url=self.__data.remote_url,
+      recursive=self.__data.recursive,
+      #sample_n_files=1
+    )
+    downloader_config = DropboxDownloaderConfig(
+      download_dir=self.working_directory
+    )
+    connection_config = DropboxConnectionConfig(
+      access_config=DropboxAccessConfig(
+        token=self.__data.token
+        )
+    )
+    self.__unstructured_ingest.pipeline(
+      indexer_config,
+      downloader_config,
+      connection_config,
+      extension=self.__data.extension).run()
+  async def load(self) -> list[Document]:
+      await asyncio.to_thread(self.run)
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/gcs.py ADDED Viewed

@@ -0,0 +1,62 @@
+import asyncio
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.v2.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import Union, Optional
+from pydantic import BaseModel, Field, AliasChoices
+class GcsParams(BaseModel):
+  """
+  GcsParams is a model that defines the parameters required for Google Cloud Storage (GCS) integration.
+  Documentation:
+    - create service account: https://cloud.google.com/iam/docs/service-accounts-create?hl=en#console
+    - create key: https://cloud.google.com/iam/docs/keys-create-delete?hl=en#creating
+    - export key in a single line\n
+    ```pwsh
+    (Get-Content -Path "<path-to-downloaded-key-file>" -Raw).Replace("`r`n", "").Replace("`n", "")
+    ```
+    - create bucket with 'Storage Object Viewer' permission: https://cloud.google.com/storage/docs/creating-buckets?hl=en#console
+    - add principal to bucket: https://cloud.google.com/storage/docs/access-control/using-iam-permissions?hl=en#console
+    - manage IAM policies: https://cloud.google.com/storage/docs/access-control/using-iam-permissions?hl=en
+  Attributes:
+    remote_url (str): The URL of the remote GCS bucket, e.g. 'gcs://demo-bucket' or 'gcs://demo-bucket/sub-directory'.
+    service_account_key (str): The service account key for accessing the GCS bucket.
+    recursive (bool): A flag indicating whether to recursively access the GCS bucket. Defaults to False.
+    extension (list[str]): A list of file extensions to filter the files in the GCS bucket. Defaults to None.
+  """
+  remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
+  service_account_key: str = Field(validation_alias=AliasChoices("serviceAccountKey","service_account_key"))
+  recursive: bool = False
+  extension: list[str] = Field(default=None)
+class Gcs(IntegrationStrategy):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    super().__init__(knowledgebase_path, data)
+    self.__data = GcsParams.model_validate(self.data)
+    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+  def working_subdirectory(self) -> str:
+    return 'gcs'
+  def run(self) -> None:
+    indexer_config = GcsIndexerConfig(
+      remote_url=self.__data.remote_url,
+      recursive=self.__data.recursive,
+      #sample_n_files=1
+    )
+    downloader_config = GcsDownloaderConfig(
+      download_dir=self.working_directory
+    )
+    connection_config = GcsConnectionConfig(
+      access_config=GcsAccessConfig(
+        service_account_key=self.__data.service_account_key
+        )
+    )
+    self.__unstructured_ingest.pipeline(
+      indexer_config,
+      downloader_config,
+      connection_config,
+      extension=self.__data.extension).run()
+  async def load(self) -> list[Document]:
+      await asyncio.to_thread(self.run)
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/integration/github.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import asyncio
 from typing import Optional, Union
-from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
+from unstructured_ingest.interfaces import  ProcessorConfig, ReadConfig
 from unstructured_ingest.connector.git import GitAccessConfig
 from unstructured_ingest.connector.github import SimpleGitHubConfig
 from unstructured_ingest.runner import GithubRunner
@@ -9,6 +10,15 @@ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from pydantic import BaseModel, Field, AliasChoices
 class GithubParams(BaseModel):
+  """
+  GithubParams is a model for storing parameters required to interact with a GitHub repository.
+  Attributes:
+    repo (str): The name of the GitHub repository, e.g., 'companyname/reponame'
+    access_token (Optional[str]): The access token for authenticating with GitHub, e.g., 'ghp_1234567890'.
+    branch (Optional[str]): The branch of the repository to interact with. Defaults to 'main'.
+    file_ext (Optional[list[str]]): A list of file extensions to filter by, e.g. ['.md', '.pdf']. Defaults to an empty list.
+  """
   repo: str
   access_token: Optional[str] | None = Field(None,validation_alias=AliasChoices("accessToken","access_token"))
   branch: Optional[str] = 'main'
@@ -17,8 +27,6 @@ class Github(IntegrationStrategy):
   def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
     super().__init__(knowledgebase_path, data)
     self.__data = GithubParams.model_validate(self.data)
-    self.__loader = Loader(self.working_directory)
-    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
   def working_subdirectory(self) -> str:
     return 'github'
   def run(self) -> None:
@@ -35,12 +43,13 @@ class Github(IntegrationStrategy):
     )
     runner = GithubRunner(
       connector_config=config,
-      processor_config=self.__unstructured_ingest.processor_config(),
-      read_config=self.__unstructured_ingest.read_config(),
-      partition_config=self.__unstructured_ingest.partition_config(),
-      retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
+      processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
+      read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
+      partition_config=None,
+      retry_strategy_config=None
       )
     runner.run()
   async def load(self) -> list[Document]:
       await asyncio.to_thread(self.run)
-      return await self.__loader.load()
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/googledrive.py ADDED Viewed

@@ -0,0 +1,69 @@
+import asyncio
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.v2.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import Union
+from pydantic import BaseModel, Field, AliasChoices
+class GoogleDriveParams(BaseModel):
+  """
+  GoogleDriveParams is a model that holds parameters for Google Drive integration.
+  Attributes:
+    service_account_key (dict): The service account key for Google Drive API authentication \n
+      - detail: https://developers.google.com/workspace/guides/create-credentials#service-account \n
+      - create a service account key, download the JSON file, and pass the content of the JSON file as a dictionary \n
+      - e.g., {
+        "type": "service_account",
+        "project_id": "demo-project-123456",
+        "private_key_id": "**********",
+        "private_key": "-----BEGIN PRIVATE KEY-----...----END PRIVATE KEY-----",
+        "client_email": "demo-client@demo-project-123456.iam.gserviceaccount.com",
+        "client_id": "123456",
+        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+        "token_uri": "https://oauth2.googleapis.com/token",
+        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/demo-client%40demo-project-123456.iam.gserviceaccount.com",
+        "universe_domain": "googleapis.com"
+      }
+      - enable Google Drive API: https://console.cloud.google.com/marketplace/product/google/drive.googleapis.com
+      - copy email address of the service account and share the Google Drive with the email address: https://www.youtube.com/watch?v=ykJQzEe_2dM&t=2s
+    drive_id (str): The {folder_id} of the Google Drive to interact with, e.g., https://drive.google.com/drive/folders/{folder_id}
+    extensions (list[str]): A list of file extensions to filter the files in the Google Drive, e.g., ['.pdf', '.docx'].
+    recursive (bool): A flag indicating whether to search files recursively in the Google Drive.
+  """
+  service_account_key: dict = Field(validation_alias=AliasChoices("serviceAccountKey","service_account_key"))
+  drive_id: str = Field(validation_alias=AliasChoices("driveId","drive_id"))
+  extensions: list[str] = []
+  recursive: bool = False
+class GoogleDrive(IntegrationStrategy):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    super().__init__(knowledgebase_path, data)
+    self.__data = GoogleDriveParams.model_validate(self.data)
+    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+  def working_subdirectory(self) -> str:
+    return 'googledrive'
+  def run(self) -> None:
+    indexer_config = GoogleDriveIndexerConfig(
+      extensions=self.__data.extensions,
+      recursive=self.__data.recursive
+    )
+    downloader_config = GoogleDriveDownloaderConfig(
+      download_dir=self.working_directory
+    )
+    connection_config = GoogleDriveConnectionConfig(
+      access_config=GoogleDriveAccessConfig(
+        service_account_key=self.__data.service_account_key
+        ),
+      drive_id=self.__data.drive_id
+    )
+    self.__unstructured_ingest.pipeline(
+      indexer_config,
+      downloader_config,
+      connection_config).run()
+  async def load(self) -> list[Document]:
+      await asyncio.to_thread(self.run)
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/integration/jira.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import asyncio
-from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
+from unstructured_ingest.interfaces import  ProcessorConfig, ReadConfig
 from unstructured_ingest.connector.jira import SimpleJiraConfig, JiraAccessConfig
 from unstructured_ingest.runner import JiraRunner
 from langchain_core.documents import Document
@@ -8,6 +9,17 @@ from pydantic import BaseModel, Field, AliasChoices
 from typing import Optional, Union
 class JiraParams(BaseModel):
+  """
+  JiraParams is a Pydantic model that represents the parameters required to interact with a Jira instance.
+  Attributes:
+    url (str): The URL of the Jira instance, e.g., 'https://example.atlassian.net'.
+    access_token (str): The access token for authenticating with the Jira API.
+    user_email (str): The email address of the Jira user.
+    projects (list[str]): A list of project keys or IDs to interact with, e.g., ['SCRUM', 'PROJ1'].
+    boards (Optional[list[str]]): An optional list of board IDs to interact with. Defaults to None, e.g., ['1', '2'].
+    issues (Optional[list[str]]): An optional list of issue keys or IDs to interact with. Defaults to None, e.g., ['SCRUM-1', 'PROJ1-1'].
+  """
   url: str
   access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
   user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"))
@@ -18,8 +30,6 @@ class Jira(IntegrationStrategy):
   def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
     super().__init__(knowledgebase_path, data)
     self.__data = JiraParams.model_validate(self.data)
-    self.__loader = Loader(self.working_directory)
-    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
   def working_subdirectory(self) -> str:
     return 'jira'
   def run(self) -> None:
@@ -36,13 +46,13 @@ class Jira(IntegrationStrategy):
     )
     runner = JiraRunner(
       connector_config=config,
-      processor_config=self.__unstructured_ingest.processor_config(),
-      read_config=self.__unstructured_ingest.read_config(),
-      partition_config=self.__unstructured_ingest.partition_config(),
-      retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
+      processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
+      read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
+      partition_config=None,
+      retry_strategy_config=None
       )
     runner.run()
   async def load(self) -> list[Document]:
       await asyncio.to_thread(self.run)
       await asyncio.sleep(1)
-      return await self.__loader.load()
+      return await Loader(self.working_directory).load()

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/integration/manager.py RENAMED Viewed

@@ -1,16 +1,31 @@
 from typing import Type
+from ws_bom_robot_app.llm.vector_store.integration.azure import Azure
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
 from ws_bom_robot_app.llm.vector_store.integration.confluence import Confluence
+from ws_bom_robot_app.llm.vector_store.integration.dropbox import Dropbox
+from ws_bom_robot_app.llm.vector_store.integration.gcs import Gcs
 from ws_bom_robot_app.llm.vector_store.integration.github import Github
+from ws_bom_robot_app.llm.vector_store.integration.googledrive import GoogleDrive
 from ws_bom_robot_app.llm.vector_store.integration.jira import Jira
+from ws_bom_robot_app.llm.vector_store.integration.s3 import S3
+from ws_bom_robot_app.llm.vector_store.integration.sftp import Sftp
 from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap
+from ws_bom_robot_app.llm.vector_store.integration.slack import Slack
 class IntegrationManager:
   _list: dict[str, Type[IntegrationStrategy]] = {
-    "llmkbsitemap": Sitemap,
+    "llmkbazure": Azure,
+    "llmkbconfluence": Confluence,
+    "llmkbdropbox": Dropbox,
     "llmkbgithub": Github,
+    "llmkbgcs": Gcs,
+    "llmkbgoogledrive": GoogleDrive,
     "llmkbjira": Jira,
-    "llmkbconfluence": Confluence,
+    "llmkbs3": S3,
+    "llmkbsftp": Sftp,
+    "llmkbsitemap": Sitemap,
+    "llmkbslack": Slack,
   }
   @classmethod
   def get_strategy(cls, name: str, knowledgebase_path: str, data: dict[str, str]) -> IntegrationStrategy:

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/s3.py ADDED Viewed

@@ -0,0 +1,64 @@
+import asyncio
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.v2.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import Union, Optional
+from pydantic import BaseModel, Field, AliasChoices
+class S3Params(BaseModel):
+  """
+  S3Params is a data model for storing parameters required to interact with an S3 bucket.
+  Documentation:
+    - create S3 bucket: https://docs.aws.amazon.com/AmazonS3/latest/userguide/GetStartedWithS3.html#creating-bucket
+    - enable authenticated bucket access: https://docs.aws.amazon.com/AmazonS3/latest/userguide/walkthrough1.html
+    - set policies s3:ListBucket and s3:GetObject: https://docs.aws.amazon.com/AmazonS3/latest/userguide/example-policies-s3.html
+    - generate key/secret: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html#Using_CreateAccessKey
+    - optionally create STS token: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_request.html#api_getsessiontoken
+  Attributes:
+    remote_url (str): The URL of the remote S3 bucket, e.g., 's3://demo-bucket' or 's3://demo-bucket/sub-directory'.
+    key (Optional[str]): The AWS access key ID for the authenticated AWS IAM user, e.g., 'AKIAIOSFODNN7EXAMPLE'.
+    secret (Optional[str]): The corresponding AWS secret access key, e.g., 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'.
+    token (Optional[str]):  If required, the AWS STS session token for temporary access. Default is None.
+    recursive (bool): A flag indicating whether to perform operations recursively. Default is False.
+    extension (list[str]): A list of file extensions to filter the files. Default is None. e.g., ['.pdf', '.docx'].
+  """
+  remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
+  key: str
+  secret: str
+  token: Optional[str]  = None
+  recursive: bool = False
+  extension: list[str] = Field(default=None)
+class S3(IntegrationStrategy):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    super().__init__(knowledgebase_path, data)
+    self.__data = S3Params.model_validate(self.data)
+    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+  def working_subdirectory(self) -> str:
+    return 's3'
+  def run(self) -> None:
+    indexer_config = S3IndexerConfig(
+      remote_url=self.__data.remote_url,
+      recursive=self.__data.recursive,
+      #sample_n_files=1
+    )
+    downloader_config = S3DownloaderConfig(
+      download_dir=self.working_directory
+    )
+    connection_config = S3ConnectionConfig(
+      access_config=S3AccessConfig(
+        key=self.__data.key,
+        secret=self.__data.secret,
+        token=self.__data.token
+        )
+    )
+    self.__unstructured_ingest.pipeline(
+      indexer_config,
+      downloader_config,
+      connection_config,
+      extension=self.__data.extension).run()
+  async def load(self) -> list[Document]:
+      await asyncio.to_thread(self.run)
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/sftp.py ADDED Viewed

@@ -0,0 +1,64 @@
+import asyncio
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.v2.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import Union, Optional
+from pydantic import BaseModel, Field, AliasChoices
+class SftpParams(BaseModel):
+  """
+  SftpParams is a model that defines the parameters required for SFTP integration.
+  Attributes:
+    remote_url (str): The URL of the remote SFTP server, e.g. 'sftp://example.com' or 'sftp://example.com/directory'.
+    host (Optional[str]): The hostname or IP address of the SFTP server. Defaults to None and inferred from remote_url
+    port (Optional[int]): The port number to connect to on the SFTP server. Defaults to 22.
+    username (str): The username to authenticate with the SFTP server.
+    password (str): The password to authenticate with the SFTP server.
+    recursive (bool): Whether to perform recursive operations. Defaults to False.
+    extension (list[str]): A list of file extensions to filter by. Defaults to None, e.g. ['.pdf', '.docx'].
+  """
+  remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
+  host: Optional[str] = None
+  port: Optional[int] = 22
+  username: str
+  password: str
+  recursive: bool = False
+  extension: list[str] = Field(default=None)
+class Sftp(IntegrationStrategy):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    super().__init__(knowledgebase_path, data)
+    self.__data = SftpParams.model_validate(self.data)
+    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+  def working_subdirectory(self) -> str:
+    return 'sftp'
+  def run(self) -> None:
+    indexer_config = SftpIndexerConfig(
+      remote_url=self.__data.remote_url,
+      recursive=self.__data.recursive,
+      #sample_n_files=1
+    )
+    downloader_config = SftpDownloaderConfig(
+      download_dir=self.working_directory,
+      remote_url=self.__data.remote_url
+    )
+    connection_config = SftpConnectionConfig(
+      access_config=SftpAccessConfig(
+        password=self.__data.password
+        ),
+      username=self.__data.username,
+      host=self.__data.host,
+      port=self.__data.port,
+      look_for_keys=False,
+      allow_agent=False
+    )
+    self.__unstructured_ingest.pipeline(
+      indexer_config,
+      downloader_config,
+      connection_config,
+      extension=self.__data.extension).run()
+  async def load(self) -> list[Document]:
+      await asyncio.to_thread(self.run)
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/slack.py ADDED Viewed

@@ -0,0 +1,57 @@
+import asyncio
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.v2.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import Union
+from pydantic import BaseModel, Field, AliasChoices
+from datetime import datetime, timedelta
+class SlackParams(BaseModel):
+  """
+  SlackParams is a data model for storing Slack integration parameters.
+  Documentation:
+    - create slack app: https://api.slack.com/quickstart#creating
+    - set channels:history scope: https://api.slack.com/quickstart#scopes
+    - installing app/get token: https://api.slack.com/quickstart#installing
+    - add app to channel/s
+  Attributes:
+    token (str): The authentication token for accessing the Slack API.
+    channels (list[str]): A list of Slack channel IDs, e.g. ['C01B2PZQX1V'].
+    num_days (int, optional): The number of days to retrieve messages from. Defaults to 7.
+    extension (list[str], optional): A list of file extensions to filter messages by, e.g. [".xml"]. Defaults to None.
+  """
+  token: str
+  channels: list[str]
+  num_days: int = Field(default=7,validation_alias=AliasChoices("numDays","num_days"))
+  extension: list[str] = Field(default=None)
+class Slack(IntegrationStrategy):
+  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+    super().__init__(knowledgebase_path, data)
+    self.__data = SlackParams.model_validate(self.data)
+    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+  def working_subdirectory(self) -> str:
+    return 'slack'
+  def run(self) -> None:
+    indexer_config = SlackIndexerConfig(
+      channels=self.__data.channels,
+      start_date=datetime.now() - timedelta(days=self.__data.num_days),
+      end_date=datetime.now()
+    )
+    downloader_config = SlackDownloaderConfig(
+      download_dir=self.working_directory
+    )
+    connection_config = SlackConnectionConfig(
+      access_config=SlackAccessConfig(token=self.__data.token)
+    )
+    self.__unstructured_ingest.pipeline(
+      indexer_config,
+      downloader_config,
+      connection_config,
+      extension=self.__data.extension).run()
+  async def load(self) -> list[Document]:
+      await asyncio.to_thread(self.run)
+      await asyncio.sleep(1)
+      return await Loader(self.working_directory).load()

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/loader/base.py RENAMED Viewed

@@ -1,15 +1,14 @@
-import asyncio
-from ws_bom_robot_app.config import config
-from typing import Any, Callable, Generator, Optional, Tuple
+import asyncio, gc, logging, os, traceback
+from typing import Any, Optional
 from langchain_community.document_loaders import DirectoryLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_community.document_loaders.merge import MergedDataLoader
 from langchain_core.documents import Document
 from langchain_unstructured import UnstructuredLoader
 from pydantic import BaseModel
+from ws_bom_robot_app.config import config
 from ws_bom_robot_app.llm.vector_store.loader.json_loader import JsonLoader
-import gc, logging
 class LoaderConfig(BaseModel):
   loader: type[BaseLoader]
@@ -94,7 +93,7 @@ class Loader():
     for loader_config in loader_configs.values():
         loaders.append(
           DirectoryLoader(
-            self.knowledgebase_path,
+            os.path.abspath(self.knowledgebase_path),
             glob=loader_config["glob_patterns"],
             loader_cls=loader_config["loader_cls"],
             loader_kwargs=loader_config["loader_kwargs"],
@@ -110,17 +109,23 @@ class Loader():
   #@timer
   async def load(self) -> list[Document]:
     MAX_RETRIES = 3
-    loaders = MergedDataLoader(self.__directory_loader())
+    loaders: MergedDataLoader = MergedDataLoader(self.__directory_loader())
     try:
       for attempt in range(MAX_RETRIES):
         try:
-          return await loaders.aload()
-          #return await [doc async for doc in loaders.alazy_load()]
+          _documents = []
+          async for document in loaders.alazy_load():
+            _documents.append(document)
+          return _documents
         except Exception as e:
           logging.warning(f"Attempt {attempt+1} load document  failed: {e}")
           await asyncio.sleep(1)
           if attempt == MAX_RETRIES - 1:
-            logging.error(f"Failed to load documents: {e}")
+            tb = traceback.format_exc()
+            logging.error(f"Failed to load documents: {e} | {tb}")
             return []
+        finally:
+           del _documents
     finally:
+      del loaders
       gc.collect()

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/main.py RENAMED Viewed

@@ -75,8 +75,14 @@ def diag(authenticate: bool = Depends(authenticate)):
     from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager as wsim
     from ws_bom_robot_app.llm.tools.tool_manager import ToolManager as wstm
     from ws_bom_robot_app.llm.agent_description import AgentDescriptor as wsad
     svmem = psutil.virtual_memory()
     swap = psutil.swap_memory()
+    try:
+      ws_bom_robot_app_version = pkg_resources.get_distribution("ws_bom_robot_app").version
+    except:
+      ws_bom_robot_app_version = "unknown"
+    peer_process_ids = [c.pid for c in psutil.Process(os.getppid()).children()] if config.runtime_options().is_multi_process else None
     return {
         "status":"ok",
         "uptime": {'from':_uptime,'elapsed':str(datetime.datetime.now()-_uptime)},
@@ -117,8 +123,9 @@ def diag(authenticate: bool = Depends(authenticate)):
             "os": {
                 "ppid": os.getppid(),
                 "pid": os.getpid(),
+                "pids": peer_process_ids,
                 "cwd": os.getcwd(),
-                "ws_bom_robot_app": pkg_resources.get_distribution("ws_bom_robot_app").version,
+                "ws_bom_robot_app": ws_bom_robot_app_version,
                 "env": os.environ,
             },
         },

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/requirements.txt RENAMED Viewed

@@ -20,12 +20,19 @@ faiss-cpu==1.9.0
 #loaders
 python-magic==0.4.27
 opencv-python-headless==4.10.0.84 #docker specs
-unstructured[all-docs]==0.15.14
+unstructured[all-docs]==0.16.11
 langchain_unstructured==0.1.5
 unstructured-ingest==0.3.8
+unstructured-ingest[azure]
 unstructured-ingest[confluence]
+unstructured-ingest[dropbox]
+unstructured-ingest[gcs]
+unstructured-ingest[google_drive]
 unstructured-ingest[github]
 unstructured-ingest[jira]
+unstructured-ingest[s3]
+unstructured-ingest[slack]
+unstructured-ingest[sftp]
 html5lib==1.1 #beautifulsoup4 parser
 #integrations

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/task_manager.py RENAMED Viewed

@@ -116,6 +116,7 @@ class TaskStatistics(BaseModel):
     class TaskStatisticExecutionInfo(BaseModel):
         retention_days: float = config.robot_task_retention_days
         max_concurrent: int
+        pid: int = os.getpid()
         running: list[TaskStatus]
         slowest: list
     class TaskStatisticExecutionTime(BaseModel):
@@ -169,7 +170,7 @@ class TaskManagerStrategy(ABC):
         pass
     def task_cleanup_rule(self, task: TaskEntry) -> bool:
-        return task.status.status in {"completed", "failure"} and datetime.fromisoformat(task.status.metadata.end_at) < datetime.now() - timedelta(days=config.robot_task_retention_days)
+        return task.status.metadata.start_at and datetime.fromisoformat(task.status.metadata.start_at) < datetime.now() - timedelta(days=config.robot_task_retention_days)
     def task_done_callback(self, task_entry: TaskEntry, headers: TaskHeader | None = None) -> Callable:
         def callback(task: asyncio.Task):

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ws_bom_robot_app
-Version: 0.0.21
+Version: 0.0.23
 Summary: A FastAPI application serving ws bom/robot/llm platform ai.
 Home-page: https://github.com/websolutespa/bom
 Author: Websolute Spa
@@ -23,12 +23,19 @@ Requires-Dist: langchain-core==0.3.21
 Requires-Dist: faiss-cpu==1.9.0
 Requires-Dist: python-magic==0.4.27
 Requires-Dist: opencv-python-headless==4.10.0.84
-Requires-Dist: unstructured[all-docs]==0.15.14
+Requires-Dist: unstructured[all-docs]==0.16.11
 Requires-Dist: langchain_unstructured==0.1.5
 Requires-Dist: unstructured-ingest==0.3.8
+Requires-Dist: unstructured-ingest[azure]
 Requires-Dist: unstructured-ingest[confluence]
+Requires-Dist: unstructured-ingest[dropbox]
+Requires-Dist: unstructured-ingest[gcs]
+Requires-Dist: unstructured-ingest[google_drive]
 Requires-Dist: unstructured-ingest[github]
 Requires-Dist: unstructured-ingest[jira]
+Requires-Dist: unstructured-ingest[s3]
+Requires-Dist: unstructured-ingest[slack]
+Requires-Dist: unstructured-ingest[sftp]
 Requires-Dist: html5lib==1.1
 Requires-Dist: markdownify==0.14.1
 Requires-Dist: nebuly==0.3.33
@@ -207,6 +214,13 @@ launch debugger
 streamlit run debugger.py --server.port 6002
 ```
+dockerize app from src
+```pwsh
+docker build -f Dockerfile-src -t ws-bom-robot-app:src .
+docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
+```
 ### ✈️ publish
 - [testpypi](https://test.pypi.org/project/ws-bom-robot-app/)

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app.egg-info/SOURCES.txt RENAMED Viewed

@@ -42,12 +42,19 @@ ws_bom_robot_app/llm/utils/webhooks.py
 ws_bom_robot_app/llm/vector_store/__init__.py
 ws_bom_robot_app/llm/vector_store/generator.py
 ws_bom_robot_app/llm/vector_store/integration/__init__.py
+ws_bom_robot_app/llm/vector_store/integration/azure.py
 ws_bom_robot_app/llm/vector_store/integration/base.py
 ws_bom_robot_app/llm/vector_store/integration/confluence.py
+ws_bom_robot_app/llm/vector_store/integration/dropbox.py
+ws_bom_robot_app/llm/vector_store/integration/gcs.py
 ws_bom_robot_app/llm/vector_store/integration/github.py
+ws_bom_robot_app/llm/vector_store/integration/googledrive.py
 ws_bom_robot_app/llm/vector_store/integration/jira.py
 ws_bom_robot_app/llm/vector_store/integration/manager.py
+ws_bom_robot_app/llm/vector_store/integration/s3.py
+ws_bom_robot_app/llm/vector_store/integration/sftp.py
 ws_bom_robot_app/llm/vector_store/integration/sitemap.py
+ws_bom_robot_app/llm/vector_store/integration/slack.py
 ws_bom_robot_app/llm/vector_store/loader/__init__.py
 ws_bom_robot_app/llm/vector_store/loader/base.py
 ws_bom_robot_app/llm/vector_store/loader/json_loader.py

{ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app.egg-info/requires.txt RENAMED Viewed

@@ -11,12 +11,19 @@ langchain-core==0.3.21
 faiss-cpu==1.9.0
 python-magic==0.4.27
 opencv-python-headless==4.10.0.84
-unstructured[all-docs]==0.15.14
+unstructured[all-docs]==0.16.11
 langchain_unstructured==0.1.5
 unstructured-ingest==0.3.8
+unstructured-ingest[azure]
 unstructured-ingest[confluence]
+unstructured-ingest[dropbox]
+unstructured-ingest[gcs]
+unstructured-ingest[google_drive]
 unstructured-ingest[github]
 unstructured-ingest[jira]
+unstructured-ingest[s3]
+unstructured-ingest[slack]
+unstructured-ingest[sftp]
 html5lib==1.1
 markdownify==0.14.1
 nebuly==0.3.33

ws_bom_robot_app-0.0.21/ws_bom_robot_app/llm/vector_store/integration/base.py DELETED Viewed

@@ -1,44 +0,0 @@
-import os
-from langchain_core.documents import Document
-from abc import ABC, abstractmethod
-from unstructured_ingest.interfaces import PartitionConfig,  ProcessorConfig, ReadConfig, RetryStrategyConfig
-from typing import Union
-class IntegrationStrategy(ABC):
-  """
-  Abstract base for knowledge-base integrations.
-  On construction it stores the knowledge-base path and the raw parameters,
-  then creates the directory `knowledgebase_path/working_subdirectory()` if it
-  does not exist and exposes it as `working_directory`.
-  """
-  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
-    self.knowledgebase_path = knowledgebase_path
-    self.data = data
-    # each integration works inside its own sub-folder of the knowledge base
-    self.working_directory = os.path.join(self.knowledgebase_path,self.working_subdirectory())
-    os.makedirs(self.working_directory, exist_ok=True)
-  # NOTE(review): declared as an abstract property, yet __init__ calls it like a method;
-  # the Confluence subclass overrides it with a plain method — confirm which form is intended.
-  @property
-  @abstractmethod
-  def working_subdirectory(self) -> str:
-    pass
-  # NOTE(review): declared synchronous here, while the Confluence subclass implements
-  # `load` as a coroutine — confirm the intended contract.
-  @abstractmethod
-  #@timer
-  def load(self) -> list[Document]:
-    pass
-class UnstructuredIngest():
-  """Builds the unstructured-ingest configuration objects for one working directory."""
-  def __init__(self, working_directory: str):
-    self.working_directory = working_directory
-  def processor_config(self) -> ProcessorConfig:
-    """Return a ProcessorConfig that works in the absolute working directory with a single process."""
-    return ProcessorConfig(
-      reprocess=False,
-      verbose=False,
-      work_dir=os.path.abspath(self.working_directory),
-      #output_dir=os.path.abspath(self.working_directory),
-      num_processes=1,
-      raise_on_error=False
-    )
-  def read_config(self) -> ReadConfig:
-    """Return a ReadConfig that downloads into the absolute working directory (download_only=True)."""
-    return ReadConfig(
-      download_dir=os.path.abspath(self.working_directory),
-      re_download=True,
-      preserve_downloads=True,
-      download_only=True
-    )
-  def partition_config(self) -> PartitionConfig:
-    # NOTE(review): always returns None despite the PartitionConfig annotation
-    return None
-  def retry_strategy_config(self) -> RetryStrategyConfig:
-    # NOTE(review): always returns None despite the RetryStrategyConfig annotation
-    return None

ws_bom_robot_app-0.0.21/ws_bom_robot_app/llm/vector_store/integration/confluence.py DELETED Viewed

@@ -1,47 +0,0 @@
-import asyncio
-from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.connector.confluence import SimpleConfluenceConfig, ConfluenceAccessConfig
-from unstructured_ingest.runner import ConfluenceRunner
-from langchain_core.documents import Document
-from ws_bom_robot_app.llm.vector_store.loader.base import Loader
-from typing import Optional, Union
-from pydantic import BaseModel, Field, AliasChoices
-class ConfluenceParams(BaseModel):
-  """
-  Parameters for the Confluence integration.
-  Attributes:
-    url (str): required; passed to SimpleConfluenceConfig as `url`.
-    access_token (str): required; also accepted under the alias 'accessToken'; used as the Confluence API token.
-    user_email (str): required; also accepted under the alias 'userEmail'.
-    spaces (list[str]): passed to SimpleConfluenceConfig as `spaces`. Defaults to an empty list.
-  """
-  url: str
-  access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
-  user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"))
-  spaces: list[str] = []
-class Confluence(IntegrationStrategy):
-  """
-  Integration strategy for Confluence built on the unstructured-ingest ConfluenceRunner.
-  The raw `data` dict is validated into a ConfluenceParams model; `run` executes the
-  runner and `load` reads the working directory through a Loader.
-  """
-  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
-    super().__init__(knowledgebase_path, data)
-    self.__data = ConfluenceParams.model_validate(self.data)
-    self.__loader = Loader(self.working_directory)
-    self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
-  def working_subdirectory(self) -> str:
-    """Return the sub-directory name used by this integration."""
-    return 'confluence'
-  def run(self) -> None:
-    """Build the Confluence connector config and execute the ConfluenceRunner."""
-    access_config = ConfluenceAccessConfig(
-      api_token=self.__data.access_token
-    )
-    config = SimpleConfluenceConfig(
-      user_email=self.__data.user_email,
-      url = self.__data.url,
-      access_config=access_config,
-      #max_num_of_spaces=self.data.get('max_num_of_spaces',500),
-      #max_num_of_docs_from_each_space=self.data.get('max_num_of_docs_from_each_space',100),
-      spaces=self.__data.spaces
-    )
-    # processor/read/partition/retry settings all come from the shared UnstructuredIngest helper
-    runner = ConfluenceRunner(
-      connector_config=config,
-      processor_config=self.__unstructured_ingest.processor_config(),
-      read_config=self.__unstructured_ingest.read_config(),
-      partition_config=self.__unstructured_ingest.partition_config(),
-      retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
-      )
-    runner.run()
-  async def load(self) -> list[Document]:
-      """Run the blocking ingest in a worker thread, pause one second, then load the documents."""
-      await asyncio.to_thread(self.run)
-      await asyncio.sleep(1)
-      return await self.__loader.load()