ws-bom-robot-app 0.0.21__tar.gz → 0.0.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/PKG-INFO +16 -2
  2. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/README.md +7 -0
  3. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/setup.py +1 -1
  4. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/azure.py +62 -0
  5. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/base.py +43 -0
  6. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/confluence.py +53 -0
  7. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/dropbox.py +53 -0
  8. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/gcs.py +62 -0
  9. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/integration/github.py +17 -8
  10. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/googledrive.py +69 -0
  11. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/integration/jira.py +18 -8
  12. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/integration/manager.py +17 -2
  13. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/s3.py +64 -0
  14. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/sftp.py +64 -0
  15. ws_bom_robot_app-0.0.23/ws_bom_robot_app/llm/vector_store/integration/slack.py +57 -0
  16. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/loader/base.py +14 -9
  17. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/main.py +8 -1
  18. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/requirements.txt +8 -1
  19. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/task_manager.py +2 -1
  20. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app.egg-info/PKG-INFO +16 -2
  21. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app.egg-info/SOURCES.txt +7 -0
  22. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app.egg-info/requires.txt +8 -1
  23. ws_bom_robot_app-0.0.21/ws_bom_robot_app/llm/vector_store/integration/base.py +0 -44
  24. ws_bom_robot_app-0.0.21/ws_bom_robot_app/llm/vector_store/integration/confluence.py +0 -47
  25. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/pyproject.toml +0 -0
  26. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/setup.cfg +0 -0
  27. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/__init__.py +0 -0
  28. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/auth.py +0 -0
  29. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/config.py +0 -0
  30. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/cron_manager.py +0 -0
  31. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/__init__.py +0 -0
  32. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/agent_description.py +0 -0
  33. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/agent_handler.py +0 -0
  34. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/agent_lcel.py +0 -0
  35. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/api.py +0 -0
  36. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/defaut_prompt.py +0 -0
  37. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/main.py +0 -0
  38. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/models/__init__.py +0 -0
  39. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/models/api.py +0 -0
  40. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/models/base.py +0 -0
  41. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/models/kb.py +0 -0
  42. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/settings.py +0 -0
  43. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/tools/__init__.py +0 -0
  44. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/tools/models/__init__.py +0 -0
  45. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/tools/models/main.py +0 -0
  46. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/tools/tool_builder.py +0 -0
  47. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/tools/tool_manager.py +0 -0
  48. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/tools/utils.py +0 -0
  49. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/utils/__init__.py +0 -0
  50. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/utils/agent_utils.py +0 -0
  51. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/utils/download.py +0 -0
  52. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/utils/faiss_helper.py +0 -0
  53. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/utils/kb.py +0 -0
  54. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/utils/print.py +0 -0
  55. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/utils/webhooks.py +0 -0
  56. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/__init__.py +0 -0
  57. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/generator.py +0 -0
  58. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/integration/__init__.py +0 -0
  59. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/integration/sitemap.py +0 -0
  60. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/loader/__init__.py +0 -0
  61. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/llm/vector_store/loader/json_loader.py +0 -0
  62. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app/util.py +0 -0
  63. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app.egg-info/dependency_links.txt +0 -0
  64. {ws_bom_robot_app-0.0.21 → ws_bom_robot_app-0.0.23}/ws_bom_robot_app.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.21
3
+ Version: 0.0.23
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -23,12 +23,19 @@ Requires-Dist: langchain-core==0.3.21
23
23
  Requires-Dist: faiss-cpu==1.9.0
24
24
  Requires-Dist: python-magic==0.4.27
25
25
  Requires-Dist: opencv-python-headless==4.10.0.84
26
- Requires-Dist: unstructured[all-docs]==0.15.14
26
+ Requires-Dist: unstructured[all-docs]==0.16.11
27
27
  Requires-Dist: langchain_unstructured==0.1.5
28
28
  Requires-Dist: unstructured-ingest==0.3.8
29
+ Requires-Dist: unstructured-ingest[azure]
29
30
  Requires-Dist: unstructured-ingest[confluence]
31
+ Requires-Dist: unstructured-ingest[dropbox]
32
+ Requires-Dist: unstructured-ingest[gcs]
33
+ Requires-Dist: unstructured-ingest[google_drive]
30
34
  Requires-Dist: unstructured-ingest[github]
31
35
  Requires-Dist: unstructured-ingest[jira]
36
+ Requires-Dist: unstructured-ingest[s3]
37
+ Requires-Dist: unstructured-ingest[slack]
38
+ Requires-Dist: unstructured-ingest[sftp]
32
39
  Requires-Dist: html5lib==1.1
33
40
  Requires-Dist: markdownify==0.14.1
34
41
  Requires-Dist: nebuly==0.3.33
@@ -207,6 +214,13 @@ launch debugger
207
214
  streamlit run debugger.py --server.port 6002
208
215
  ```
209
216
 
217
+ dockerize app from src
218
+
219
+ ```pwsh
220
+ docker build -f Dockerfile-src -t ws-bom-robot-app:src .
221
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
222
+ ```
223
+
210
224
  ### ✈️ publish
211
225
 
212
226
  - [testpypi](https://test.pypi.org/project/ws-bom-robot-app/)
@@ -172,6 +172,13 @@ launch debugger
172
172
  streamlit run debugger.py --server.port 6002
173
173
  ```
174
174
 
175
+ dockerize app from src
176
+
177
+ ```pwsh
178
+ docker build -f Dockerfile-src -t ws-bom-robot-app:src .
179
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
180
+ ```
181
+
175
182
  ### ✈️ publish
176
183
 
177
184
  - [testpypi](https://test.pypi.org/project/ws-bom-robot-app/)
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="ws_bom_robot_app",
5
- version="0.0.21",
5
+ version="0.0.23",
6
6
  description="A FastAPI application serving ws bom/robot/llm platform ai.",
7
7
  long_description=open("README.md", encoding='utf-8').read(),
8
8
  long_description_content_type="text/markdown",
@@ -0,0 +1,62 @@
1
+ import asyncio
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from unstructured_ingest.v2.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
4
+ from langchain_core.documents import Document
5
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
+ from typing import Union, Optional
7
+ from pydantic import BaseModel, Field, AliasChoices
8
+ class AzureParams(BaseModel):
9
+ """
10
+ AzureParams is a model that holds configuration parameters for connecting to Azure services.
11
+
12
+ Attributes:
13
+ remote_url (str): The URL of the remote Azure service, in the form az://<container> or az://<container>/<path> for sub-folders.
14
+ account_name (str): The name of the Azure storage account.
15
+ \nProvide one of the following:
16
+ - account_key (Optional[str]): The key for the Azure storage account. Default is None.
17
+ - connection_string (Optional[str]): The connection string for the Azure storage account. Default is None.
18
+ - sas_token (Optional[str]): The Shared Access Signature token for the Azure storage account. Default is None. Detail: https://learn.microsoft.com/en-us/azure/ai-services/translator/document-translation/how-to-guides/create-sas-tokens?tabs=Containers
19
+ recursive (bool): Indicates whether the operation should be recursive. Default is False.
20
+ extension (list[str]): A list of file extensions to filter the files. Default is None.
21
+ """
22
+ remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
23
+ account_name: str = Field(validation_alias=AliasChoices("accountName","account_name"))
24
+ account_key: Optional[str] = Field(default=None,validation_alias=AliasChoices("accountKey","account_key"))
25
+ connection_string: Optional[str] = Field(default=None,validation_alias=AliasChoices("connectionString","connection_string"))
26
+ sas_token: Optional[str] = Field(default=None,validation_alias=AliasChoices("sasToken","sas_token"))
27
+ recursive: bool = False
28
+ extension: list[str] = Field(default=None)
29
+ class Azure(IntegrationStrategy):
30
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
31
+ super().__init__(knowledgebase_path, data)
32
+ self.__data = AzureParams.model_validate(self.data)
33
+ self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
34
+ def working_subdirectory(self) -> str:
35
+ return 'azure'
36
+ def run(self) -> None:
37
+ indexer_config = AzureIndexerConfig(
38
+ remote_url=self.__data.remote_url,
39
+ recursive=self.__data.recursive,
40
+ #sample_n_files=1
41
+ )
42
+ downloader_config = AzureDownloaderConfig(
43
+ download_dir=self.working_directory
44
+ )
45
+ connection_config = AzureConnectionConfig(
46
+ access_config=AzureAccessConfig(
47
+ account_name=self.__data.account_name,
48
+ account_key=self.__data.account_key,
49
+ connection_string=self.__data.connection_string,
50
+ sas_token=self.__data.sas_token
51
+ )
52
+ )
53
+ self.__unstructured_ingest.pipeline(
54
+ indexer_config,
55
+ downloader_config,
56
+ connection_config,
57
+ extension=self.__data.extension).run()
58
+ async def load(self) -> list[Document]:
59
+ await asyncio.to_thread(self.run)
60
+ await asyncio.sleep(1)
61
+ return await Loader(self.working_directory).load()
62
+
@@ -0,0 +1,43 @@
1
+ import os
2
+ from langchain_core.documents import Document
3
+ from abc import ABC, abstractmethod
4
+ from unstructured_ingest.v2.interfaces import ProcessorConfig
5
+ from unstructured_ingest.v2.pipeline.pipeline import Pipeline, PartitionerConfig, FiltererConfig
6
+ from typing import Union
7
+
8
+ class IntegrationStrategy(ABC):
9
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
10
+ self.knowledgebase_path = knowledgebase_path
11
+ self.data = data
12
+ self.working_directory = os.path.join(self.knowledgebase_path,self.working_subdirectory())
13
+ os.makedirs(self.working_directory, exist_ok=True)
14
+ @property
15
+ @abstractmethod
16
+ def working_subdirectory(self) -> str:
17
+ pass
18
+ @abstractmethod
19
+ #@timer
20
+ def load(self) -> list[Document]:
21
+ pass
22
+
23
+ class UnstructuredIngest():
24
+ def __init__(self, working_directory: str):
25
+ self.working_directory = working_directory
26
+ def pipeline(self,indexer,downloader,connection,extension: list[str] = None) -> Pipeline:
27
+ return Pipeline.from_configs(
28
+ context=ProcessorConfig(
29
+ reprocess=False,
30
+ verbose=False,
31
+ tqdm=False,
32
+ num_processes=2,
33
+ preserve_downloads=True,
34
+ download_only=True,
35
+ raise_on_error=False
36
+ ),
37
+ indexer_config=indexer,
38
+ downloader_config=downloader,
39
+ source_connection_config=connection,
40
+ partitioner_config=PartitionerConfig(),
41
+ filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
42
+ )
43
+
@@ -0,0 +1,53 @@
1
+ import asyncio
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
4
+ from langchain_core.documents import Document
5
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
+ from typing import Union
7
+ from pydantic import BaseModel, Field, AliasChoices
8
+
9
+ class ConfluenceParams(BaseModel):
10
+ """
11
+ ConfluenceParams is a data model for storing Confluence integration parameters.
12
+
13
+ Attributes:
14
+ url (str): The URL of the Confluence instance, e.g., 'https://example.atlassian.net'.
15
+ access_token (str): The access token for authenticating with Confluence, e.g., 'AT....'
16
+ user_email (str): The email address of the Confluence user
17
+ spaces (list[str]): A list of Confluence spaces to interact with, e.g., ['SPACE1', 'SPACE2'].
18
+ extension (list[str], optional): A list of file extensions to filter by, e.g., ['.pdf', '.docx']. Defaults to None.
19
+ """
20
+ url: str
21
+ access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
22
+ user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"))
23
+ spaces: list[str] = []
24
+ extension: list[str] = Field(default=None)
25
+ class Confluence(IntegrationStrategy):
26
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
27
+ super().__init__(knowledgebase_path, data)
28
+ self.__data = ConfluenceParams.model_validate(self.data)
29
+ self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
30
+ def working_subdirectory(self) -> str:
31
+ return 'confluence'
32
+ def run(self) -> None:
33
+ indexer_config = ConfluenceIndexerConfig(
34
+ spaces=self.__data.spaces
35
+ )
36
+ downloader_config = ConfluenceDownloaderConfig(
37
+ download_dir=self.working_directory
38
+ )
39
+ connection_config = ConfluenceConnectionConfig(
40
+ access_config=ConfluenceAccessConfig(api_token=self.__data.access_token),
41
+ url=self.__data.url,
42
+ user_email=self.__data.user_email
43
+ )
44
+ self.__unstructured_ingest.pipeline(
45
+ indexer_config,
46
+ downloader_config,
47
+ connection_config,
48
+ extension=self.__data.extension).run()
49
+ async def load(self) -> list[Document]:
50
+ await asyncio.to_thread(self.run)
51
+ await asyncio.sleep(1)
52
+ return await Loader(self.working_directory).load()
53
+
@@ -0,0 +1,53 @@
1
+ import asyncio
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
4
+ from langchain_core.documents import Document
5
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
+ from typing import Union
7
+ from pydantic import BaseModel, Field, AliasChoices
8
+ class DropboxParams(BaseModel):
9
+ """
10
+ DropboxParams is a model for storing parameters required to interact with Dropbox.
11
+
12
+ Attributes:
13
+ remote_url (str): The URL of the remote Dropbox location, e.g. 'dropbox://demo-directory' or 'dropbox://demo-directory/sub-directory'.
14
+ token (str): The authentication token for accessing Dropbox.
15
+ Create an app at https://www.dropbox.com/developers with the files.content.read permission, then generate a token.
16
+ recursive (bool, optional): A flag indicating whether to search directories recursively. Defaults to False.
17
+ extension (list[str], optional): A list of file extensions to filter by, e.g. ['.pdf', '.docx']. Defaults to None.
18
+ """
19
+ remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
20
+ token: str
21
+ recursive: bool = False
22
+ extension: list[str] = Field(default=None)
23
+ class Dropbox(IntegrationStrategy):
24
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
25
+ super().__init__(knowledgebase_path, data)
26
+ self.__data = DropboxParams.model_validate(self.data)
27
+ self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
28
+ def working_subdirectory(self) -> str:
29
+ return 'dropbox'
30
+ def run(self) -> None:
31
+ indexer_config = DropboxIndexerConfig(
32
+ remote_url=self.__data.remote_url,
33
+ recursive=self.__data.recursive,
34
+ #sample_n_files=1
35
+ )
36
+ downloader_config = DropboxDownloaderConfig(
37
+ download_dir=self.working_directory
38
+ )
39
+ connection_config = DropboxConnectionConfig(
40
+ access_config=DropboxAccessConfig(
41
+ token=self.__data.token
42
+ )
43
+ )
44
+ self.__unstructured_ingest.pipeline(
45
+ indexer_config,
46
+ downloader_config,
47
+ connection_config,
48
+ extension=self.__data.extension).run()
49
+ async def load(self) -> list[Document]:
50
+ await asyncio.to_thread(self.run)
51
+ await asyncio.sleep(1)
52
+ return await Loader(self.working_directory).load()
53
+
@@ -0,0 +1,62 @@
1
+ import asyncio
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from unstructured_ingest.v2.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
4
+ from langchain_core.documents import Document
5
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
+ from typing import Union, Optional
7
+ from pydantic import BaseModel, Field, AliasChoices
8
+ class GcsParams(BaseModel):
9
+ """
10
+ GcsParams is a model that defines the parameters required for Google Cloud Storage (GCS) integration.
11
+ Documentation:
12
+ - create service account: https://cloud.google.com/iam/docs/service-accounts-create?hl=en#console
13
+ - create key: https://cloud.google.com/iam/docs/keys-create-delete?hl=en#creating
14
+ - export key in a single line\n
15
+ ```pwsh
16
+ (Get-Content -Path "<path-to-downloaded-key-file>" -Raw).Replace("`r`n", "").Replace("`n", "")
17
+ ```
18
+ - create bucket with 'Storage Object Viewer' permission: https://cloud.google.com/storage/docs/creating-buckets?hl=en#console
19
+ - add principal to bucket: https://cloud.google.com/storage/docs/access-control/using-iam-permissions?hl=en#console
20
+ - manage IAM policies: https://cloud.google.com/storage/docs/access-control/using-iam-permissions?hl=en
21
+
22
+ Attributes:
23
+ remote_url (str): The URL of the remote GCS bucket, e.g. 'gcs://demo-bucket' or 'gcs://demo-bucket/sub-directory'.
24
+ service_account_key (str): The service account key for accessing the GCS bucket.
25
+ recursive (bool): A flag indicating whether to recursively access the GCS bucket. Defaults to False.
26
+ extension (list[str]): A list of file extensions to filter the files in the GCS bucket. Defaults to None.
27
+ """
28
+ remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
29
+ service_account_key: str = Field(validation_alias=AliasChoices("serviceAccountKey","service_account_key"))
30
+ recursive: bool = False
31
+ extension: list[str] = Field(default=None)
32
+ class Gcs(IntegrationStrategy):
33
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
34
+ super().__init__(knowledgebase_path, data)
35
+ self.__data = GcsParams.model_validate(self.data)
36
+ self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
37
+ def working_subdirectory(self) -> str:
38
+ return 'gcs'
39
+ def run(self) -> None:
40
+ indexer_config = GcsIndexerConfig(
41
+ remote_url=self.__data.remote_url,
42
+ recursive=self.__data.recursive,
43
+ #sample_n_files=1
44
+ )
45
+ downloader_config = GcsDownloaderConfig(
46
+ download_dir=self.working_directory
47
+ )
48
+ connection_config = GcsConnectionConfig(
49
+ access_config=GcsAccessConfig(
50
+ service_account_key=self.__data.service_account_key
51
+ )
52
+ )
53
+ self.__unstructured_ingest.pipeline(
54
+ indexer_config,
55
+ downloader_config,
56
+ connection_config,
57
+ extension=self.__data.extension).run()
58
+ async def load(self) -> list[Document]:
59
+ await asyncio.to_thread(self.run)
60
+ await asyncio.sleep(1)
61
+ return await Loader(self.working_directory).load()
62
+
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  from typing import Optional, Union
3
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
4
+ from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
4
5
  from unstructured_ingest.connector.git import GitAccessConfig
5
6
  from unstructured_ingest.connector.github import SimpleGitHubConfig
6
7
  from unstructured_ingest.runner import GithubRunner
@@ -9,6 +10,15 @@ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
9
10
  from pydantic import BaseModel, Field, AliasChoices
10
11
 
11
12
  class GithubParams(BaseModel):
13
+ """
14
+ GithubParams is a model for storing parameters required to interact with a GitHub repository.
15
+
16
+ Attributes:
17
+ repo (str): The name of the GitHub repository, e.g., 'companyname/reponame'
18
+ access_token (Optional[str]): The access token for authenticating with GitHub, e.g., 'ghp_1234567890'.
19
+ branch (Optional[str]): The branch of the repository to interact with. Defaults to 'main'.
20
+ file_ext (Optional[list[str]]): A list of file extensions to filter by, e.g. ['.md', '.pdf']. Defaults to an empty list.
21
+ """
12
22
  repo: str
13
23
  access_token: Optional[str] | None = Field(None,validation_alias=AliasChoices("accessToken","access_token"))
14
24
  branch: Optional[str] = 'main'
@@ -17,8 +27,6 @@ class Github(IntegrationStrategy):
17
27
  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
18
28
  super().__init__(knowledgebase_path, data)
19
29
  self.__data = GithubParams.model_validate(self.data)
20
- self.__loader = Loader(self.working_directory)
21
- self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
22
30
  def working_subdirectory(self) -> str:
23
31
  return 'github'
24
32
  def run(self) -> None:
@@ -35,12 +43,13 @@ class Github(IntegrationStrategy):
35
43
  )
36
44
  runner = GithubRunner(
37
45
  connector_config=config,
38
- processor_config=self.__unstructured_ingest.processor_config(),
39
- read_config=self.__unstructured_ingest.read_config(),
40
- partition_config=self.__unstructured_ingest.partition_config(),
41
- retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
46
+ processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
47
+ read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
48
+ partition_config=None,
49
+ retry_strategy_config=None
42
50
  )
43
51
  runner.run()
44
52
  async def load(self) -> list[Document]:
45
53
  await asyncio.to_thread(self.run)
46
- return await self.__loader.load()
54
+ await asyncio.sleep(1)
55
+ return await Loader(self.working_directory).load()
@@ -0,0 +1,69 @@
1
+ import asyncio
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from unstructured_ingest.v2.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
4
+ from langchain_core.documents import Document
5
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
+ from typing import Union
7
+ from pydantic import BaseModel, Field, AliasChoices
8
+ class GoogleDriveParams(BaseModel):
9
+ """
10
+ GoogleDriveParams is a model that holds parameters for Google Drive integration.
11
+
12
+ Attributes:
13
+ service_account_key (dict): The service account key for Google Drive API authentication \n
14
+ - detail: https://developers.google.com/workspace/guides/create-credentials#service-account \n
15
+ - create a service account key, download the JSON file, and pass the content of the JSON file as a dictionary \n
16
+ - e.g., {
17
+ "type": "service_account",
18
+ "project_id": "demo-project-123456",
19
+ "private_key_id": "**********",
20
+ "private_key": "-----BEGIN PRIVATE KEY-----...----END PRIVATE KEY-----",
21
+ "client_email": "demo-client@demo-project-123456.iam.gserviceaccount.com",
22
+ "client_id": "123456",
23
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
24
+ "token_uri": "https://oauth2.googleapis.com/token",
25
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
26
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/demo-client%40demo-project-123456.iam.gserviceaccount.com",
27
+ "universe_domain": "googleapis.com"
28
+ }
29
+ - enable Google Drive API: https://console.cloud.google.com/marketplace/product/google/drive.googleapis.com
30
+ - copy email address of the service account and share the Google Drive with the email address: https://www.youtube.com/watch?v=ykJQzEe_2dM&t=2s
31
+
32
+ drive_id (str): The {folder_id} of the Google Drive to interact with, e.g., https://drive.google.com/drive/folders/{folder_id}
33
+ extensions (list[str]): A list of file extensions to filter the files in the Google Drive, e.g., ['.pdf', '.docx'].
34
+ recursive (bool): A flag indicating whether to search files recursively in the Google Drive.
35
+ """
36
+ service_account_key: dict = Field(validation_alias=AliasChoices("serviceAccountKey","service_account_key"))
37
+ drive_id: str = Field(validation_alias=AliasChoices("driveId","drive_id"))
38
+ extensions: list[str] = []
39
+ recursive: bool = False
40
+ class GoogleDrive(IntegrationStrategy):
41
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
42
+ super().__init__(knowledgebase_path, data)
43
+ self.__data = GoogleDriveParams.model_validate(self.data)
44
+ self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
45
+ def working_subdirectory(self) -> str:
46
+ return 'googledrive'
47
+ def run(self) -> None:
48
+ indexer_config = GoogleDriveIndexerConfig(
49
+ extensions=self.__data.extensions,
50
+ recursive=self.__data.recursive
51
+ )
52
+ downloader_config = GoogleDriveDownloaderConfig(
53
+ download_dir=self.working_directory
54
+ )
55
+ connection_config = GoogleDriveConnectionConfig(
56
+ access_config=GoogleDriveAccessConfig(
57
+ service_account_key=self.__data.service_account_key
58
+ ),
59
+ drive_id=self.__data.drive_id
60
+ )
61
+ self.__unstructured_ingest.pipeline(
62
+ indexer_config,
63
+ downloader_config,
64
+ connection_config).run()
65
+ async def load(self) -> list[Document]:
66
+ await asyncio.to_thread(self.run)
67
+ await asyncio.sleep(1)
68
+ return await Loader(self.working_directory).load()
69
+
@@ -1,5 +1,6 @@
1
1
  import asyncio
2
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
+ from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
3
4
  from unstructured_ingest.connector.jira import SimpleJiraConfig, JiraAccessConfig
4
5
  from unstructured_ingest.runner import JiraRunner
5
6
  from langchain_core.documents import Document
@@ -8,6 +9,17 @@ from pydantic import BaseModel, Field, AliasChoices
8
9
  from typing import Optional, Union
9
10
 
10
11
  class JiraParams(BaseModel):
12
+ """
13
+ JiraParams is a Pydantic model that represents the parameters required to interact with a Jira instance.
14
+
15
+ Attributes:
16
+ url (str): The URL of the Jira instance, e.g., 'https://example.atlassian.net'.
17
+ access_token (str): The access token for authenticating with the Jira API.
18
+ user_email (str): The email address of the Jira user.
19
+ projects (list[str]): A list of project keys or IDs to interact with, e.g., ['SCRUM', 'PROJ1'].
20
+ boards (Optional[list[str]]): An optional list of board IDs to interact with, e.g., ['1', '2']. Defaults to None.
21
+ issues (Optional[list[str]]): An optional list of issue keys or IDs to interact with, e.g., ['SCRUM-1', 'PROJ1-1']. Defaults to None.
22
+ """
11
23
  url: str
12
24
  access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
13
25
  user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"))
@@ -18,8 +30,6 @@ class Jira(IntegrationStrategy):
18
30
  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
19
31
  super().__init__(knowledgebase_path, data)
20
32
  self.__data = JiraParams.model_validate(self.data)
21
- self.__loader = Loader(self.working_directory)
22
- self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
23
33
  def working_subdirectory(self) -> str:
24
34
  return 'jira'
25
35
  def run(self) -> None:
@@ -36,13 +46,13 @@ class Jira(IntegrationStrategy):
36
46
  )
37
47
  runner = JiraRunner(
38
48
  connector_config=config,
39
- processor_config=self.__unstructured_ingest.processor_config(),
40
- read_config=self.__unstructured_ingest.read_config(),
41
- partition_config=self.__unstructured_ingest.partition_config(),
42
- retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
49
+ processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
50
+ read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
51
+ partition_config=None,
52
+ retry_strategy_config=None
43
53
  )
44
54
  runner.run()
45
55
  async def load(self) -> list[Document]:
46
56
  await asyncio.to_thread(self.run)
47
57
  await asyncio.sleep(1)
48
- return await self.__loader.load()
58
+ return await Loader(self.working_directory).load()
@@ -1,16 +1,31 @@
1
1
  from typing import Type
2
+ from ws_bom_robot_app.llm.vector_store.integration.azure import Azure
2
3
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
4
  from ws_bom_robot_app.llm.vector_store.integration.confluence import Confluence
5
+ from ws_bom_robot_app.llm.vector_store.integration.dropbox import Dropbox
6
+ from ws_bom_robot_app.llm.vector_store.integration.gcs import Gcs
4
7
  from ws_bom_robot_app.llm.vector_store.integration.github import Github
8
+ from ws_bom_robot_app.llm.vector_store.integration.googledrive import GoogleDrive
5
9
  from ws_bom_robot_app.llm.vector_store.integration.jira import Jira
10
+ from ws_bom_robot_app.llm.vector_store.integration.s3 import S3
11
+ from ws_bom_robot_app.llm.vector_store.integration.sftp import Sftp
6
12
  from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap
13
+ from ws_bom_robot_app.llm.vector_store.integration.slack import Slack
7
14
 
8
15
  class IntegrationManager:
9
16
  _list: dict[str, Type[IntegrationStrategy]] = {
10
- "llmkbsitemap": Sitemap,
17
+ "llmkbazure": Azure,
18
+ "llmkbconfluence": Confluence,
19
+ "llmkbdropbox": Dropbox,
11
20
  "llmkbgithub": Github,
21
+ "llmkbgcs": Gcs,
22
+ "llmkbgoogledrive": GoogleDrive,
12
23
  "llmkbjira": Jira,
13
- "llmkbconfluence": Confluence,
24
+ "llmkbs3": S3,
25
+ "llmkbsftp": Sftp,
26
+ "llmkbsitemap": Sitemap,
27
+ "llmkbslack": Slack,
28
+
14
29
  }
15
30
  @classmethod
16
31
  def get_strategy(cls, name: str, knowledgebase_path: str, data: dict[str, str]) -> IntegrationStrategy:
@@ -0,0 +1,64 @@
1
+ import asyncio
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from unstructured_ingest.v2.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
4
+ from langchain_core.documents import Document
5
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
+ from typing import Union, Optional
7
+ from pydantic import BaseModel, Field, AliasChoices
8
class S3Params(BaseModel):
    """
    Validated parameters for reading documents from an S3 bucket.

    Documentation:
      - create S3 bucket: https://docs.aws.amazon.com/AmazonS3/latest/userguide/GetStartedWithS3.html#creating-bucket
      - enable authenticated bucket access: https://docs.aws.amazon.com/AmazonS3/latest/userguide/walkthrough1.html
      - set policies s3:ListBucket and s3:GetObject: https://docs.aws.amazon.com/AmazonS3/latest/userguide/example-policies-s3.html
      - generate key/secret: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html#Using_CreateAccessKey
      - optionally create STS token: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_request.html#api_getsessiontoken

    Attributes:
      remote_url (str): The URL of the remote S3 bucket, e.g., 's3://demo-bucket' or 's3://demo-bucket/sub-directory'.
        Accepted in the input payload as either 'remoteUrl' or 'remote_url'.
      key (str): Required. The AWS access key ID for the authenticated AWS IAM user, e.g., 'AKIAIOSFODNN7EXAMPLE'.
      secret (str): Required. The corresponding AWS secret access key, e.g., 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'.
      token (Optional[str]): If required, the AWS STS session token for temporary access. Default is None.
      recursive (bool): A flag indicating whether to perform operations recursively. Default is False.
      extension (Optional[list[str]]): A list of file extensions to filter the files, e.g., ['.pdf', '.docx']. Default is None.
    """
    remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
    key: str
    secret: str
    token: Optional[str] = None
    recursive: bool = False
    # Declared Optional so the annotation agrees with the None default and an
    # explicit null in the payload validates the same way as omitting the field.
    extension: Optional[list[str]] = Field(default=None)
32
class S3(IntegrationStrategy):
    """
    Integration strategy that downloads files from an S3 bucket into the
    strategy's working directory and loads them as documents.
    """

    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
        """
        Validate the raw integration payload and prepare the ingest helper.

        Args:
          knowledgebase_path: forwarded to the base strategy.
          data: raw parameters, validated against S3Params.
        """
        super().__init__(knowledgebase_path, data)
        self.__data = S3Params.model_validate(self.data)
        self.__unstructured_ingest = UnstructuredIngest(self.working_directory)

    def working_subdirectory(self) -> str:
        """Return the name of the subdirectory used by this integration."""
        return 's3'

    def run(self) -> None:
        """Build the S3 indexer/downloader/connection configs and run the ingest pipeline."""
        params = self.__data
        source_index = S3IndexerConfig(
            remote_url=params.remote_url,
            recursive=params.recursive,
        )
        target_download = S3DownloaderConfig(download_dir=self.working_directory)
        credentials = S3AccessConfig(
            key=params.key,
            secret=params.secret,
            token=params.token,
        )
        connection = S3ConnectionConfig(access_config=credentials)
        ingest_pipeline = self.__unstructured_ingest.pipeline(
            source_index,
            target_download,
            connection,
            extension=params.extension,
        )
        ingest_pipeline.run()

    async def load(self) -> list[Document]:
        """
        Run the blocking download in a worker thread, then load the working
        directory content through Loader.

        Returns:
          The documents produced by Loader for the working directory.
        """
        await asyncio.to_thread(self.run)
        # NOTE(review): one-second pause kept from the original; its rationale is not visible here.
        await asyncio.sleep(1)
        documents = await Loader(self.working_directory).load()
        return documents
64
+
@@ -0,0 +1,64 @@
1
+ import asyncio
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from unstructured_ingest.v2.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
4
+ from langchain_core.documents import Document
5
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
+ from typing import Union, Optional
7
+ from pydantic import BaseModel, Field, AliasChoices
8
class SftpParams(BaseModel):
    """
    Validated parameters for reading documents from an SFTP server.

    Attributes:
      remote_url (str): The URL of the remote SFTP server, e.g. 'sftp://example.com' or 'sftp://example.com/directory'.
        Accepted in the input payload as either 'remoteUrl' or 'remote_url'.
      host (Optional[str]): The hostname or IP address of the SFTP server. Defaults to None and inferred from remote_url.
      port (Optional[int]): The port number to connect to on the SFTP server. Defaults to 22.
      username (str): The username to authenticate with the SFTP server.
      password (str): The password to authenticate with the SFTP server.
      recursive (bool): Whether to perform recursive operations. Defaults to False.
      extension (Optional[list[str]]): A list of file extensions to filter by, e.g. ['.pdf', '.docx']. Defaults to None.
    """
    remote_url: str = Field(validation_alias=AliasChoices("remoteUrl","remote_url"))
    host: Optional[str] = None
    port: Optional[int] = 22
    username: str
    password: str
    recursive: bool = False
    # Declared Optional so the annotation agrees with the None default and an
    # explicit null in the payload validates the same way as omitting the field.
    extension: Optional[list[str]] = Field(default=None)
28
class Sftp(IntegrationStrategy):
    """
    Integration strategy that downloads files from an SFTP server into the
    strategy's working directory and loads them as documents.
    """

    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
        """
        Validate the raw integration payload and prepare the ingest helper.

        Args:
          knowledgebase_path: forwarded to the base strategy.
          data: raw parameters, validated against SftpParams.
        """
        super().__init__(knowledgebase_path, data)
        self.__data = SftpParams.model_validate(self.data)
        self.__unstructured_ingest = UnstructuredIngest(self.working_directory)

    def working_subdirectory(self) -> str:
        """Return the name of the subdirectory used by this integration."""
        return 'sftp'

    def run(self) -> None:
        """Build the SFTP indexer/downloader/connection configs and run the ingest pipeline."""
        params = self.__data
        source_index = SftpIndexerConfig(
            remote_url=params.remote_url,
            recursive=params.recursive,
        )
        target_download = SftpDownloaderConfig(
            download_dir=self.working_directory,
            remote_url=params.remote_url,
        )
        credentials = SftpAccessConfig(password=params.password)
        # Password-only authentication: key lookup and the SSH agent are both disabled.
        connection = SftpConnectionConfig(
            access_config=credentials,
            username=params.username,
            host=params.host,
            port=params.port,
            look_for_keys=False,
            allow_agent=False,
        )
        ingest_pipeline = self.__unstructured_ingest.pipeline(
            source_index,
            target_download,
            connection,
            extension=params.extension,
        )
        ingest_pipeline.run()

    async def load(self) -> list[Document]:
        """
        Run the blocking download in a worker thread, then load the working
        directory content through Loader.

        Returns:
          The documents produced by Loader for the working directory.
        """
        await asyncio.to_thread(self.run)
        # NOTE(review): one-second pause kept from the original; its rationale is not visible here.
        await asyncio.sleep(1)
        documents = await Loader(self.working_directory).load()
        return documents
64
+
@@ -0,0 +1,57 @@
1
+ import asyncio
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from unstructured_ingest.v2.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
4
+ from langchain_core.documents import Document
5
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
+ from typing import Union
7
+ from pydantic import BaseModel, Field, AliasChoices
8
+ from datetime import datetime, timedelta
9
+
10
class SlackParams(BaseModel):
    """
    Validated parameters for reading messages from Slack channels.

    Documentation:
      - create slack app: https://api.slack.com/quickstart#creating
      - set channels:history scope: https://api.slack.com/quickstart#scopes
      - installing app/get token: https://api.slack.com/quickstart#installing
      - add app to channel/s

    Attributes:
      token (str): The authentication token for accessing the Slack API.
      channels (list[str]): A list of Slack channel IDs, e.g. ['C01B2PZQX1V'].
      num_days (int, optional): The number of days to retrieve messages from. Defaults to 7.
        Accepted in the input payload as either 'numDays' or 'num_days'.
      extension (list[str], optional): A list of file extensions to filter messages by, e.g. [".xml"]. Defaults to None.
    """
    token: str
    channels: list[str]
    num_days: int = Field(default=7,validation_alias=AliasChoices("numDays","num_days"))
    # Union[..., None] rather than Optional because this module imports only Union.
    # It makes the annotation agree with the None default, so an explicit null in
    # the payload validates the same way as omitting the field.
    extension: Union[list[str], None] = Field(default=None)
29
class Slack(IntegrationStrategy):
    """
    Integration strategy that downloads recent messages from Slack channels
    into the strategy's working directory and loads them as documents.
    """

    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
        """
        Validate the raw integration payload and prepare the ingest helper.

        Args:
          knowledgebase_path: forwarded to the base strategy.
          data: raw parameters, validated against SlackParams.
        """
        super().__init__(knowledgebase_path, data)
        self.__data = SlackParams.model_validate(self.data)
        self.__unstructured_ingest = UnstructuredIngest(self.working_directory)

    def working_subdirectory(self) -> str:
        """Return the name of the subdirectory used by this integration."""
        return 'slack'

    def run(self) -> None:
        """Build the Slack indexer/downloader/connection configs and run the ingest pipeline."""
        # Take a single timestamp so both ends of the window derive from the
        # same instant and the window spans exactly num_days.
        window_end = datetime.now()
        window_start = window_end - timedelta(days=self.__data.num_days)
        indexer_config = SlackIndexerConfig(
            channels=self.__data.channels,
            start_date=window_start,
            end_date=window_end,
        )
        downloader_config = SlackDownloaderConfig(
            download_dir=self.working_directory
        )
        connection_config = SlackConnectionConfig(
            access_config=SlackAccessConfig(token=self.__data.token)
        )
        self.__unstructured_ingest.pipeline(
            indexer_config,
            downloader_config,
            connection_config,
            extension=self.__data.extension).run()

    async def load(self) -> list[Document]:
        """
        Run the blocking download in a worker thread, then load the working
        directory content through Loader.

        Returns:
          The documents produced by Loader for the working directory.
        """
        await asyncio.to_thread(self.run)
        # NOTE(review): one-second pause kept from the original; its rationale is not visible here.
        await asyncio.sleep(1)
        return await Loader(self.working_directory).load()
57
+
@@ -1,15 +1,14 @@
1
1
 
2
- import asyncio
3
- from ws_bom_robot_app.config import config
4
- from typing import Any, Callable, Generator, Optional, Tuple
2
+ import asyncio, gc, logging, os, traceback
3
+ from typing import Any, Optional
5
4
  from langchain_community.document_loaders import DirectoryLoader
6
5
  from langchain_community.document_loaders.base import BaseLoader
7
6
  from langchain_community.document_loaders.merge import MergedDataLoader
8
7
  from langchain_core.documents import Document
9
8
  from langchain_unstructured import UnstructuredLoader
10
9
  from pydantic import BaseModel
10
+ from ws_bom_robot_app.config import config
11
11
  from ws_bom_robot_app.llm.vector_store.loader.json_loader import JsonLoader
12
- import gc, logging
13
12
 
14
13
  class LoaderConfig(BaseModel):
15
14
  loader: type[BaseLoader]
@@ -94,7 +93,7 @@ class Loader():
94
93
  for loader_config in loader_configs.values():
95
94
  loaders.append(
96
95
  DirectoryLoader(
97
- self.knowledgebase_path,
96
+ os.path.abspath(self.knowledgebase_path),
98
97
  glob=loader_config["glob_patterns"],
99
98
  loader_cls=loader_config["loader_cls"],
100
99
  loader_kwargs=loader_config["loader_kwargs"],
@@ -110,17 +109,23 @@ class Loader():
110
109
  #@timer
111
110
  async def load(self) -> list[Document]:
112
111
  MAX_RETRIES = 3
113
- loaders = MergedDataLoader(self.__directory_loader())
112
+ loaders: MergedDataLoader = MergedDataLoader(self.__directory_loader())
114
113
  try:
115
114
  for attempt in range(MAX_RETRIES):
116
115
  try:
117
- return await loaders.aload()
118
- #return await [doc async for doc in loaders.alazy_load()]
116
+ _documents = []
117
+ async for document in loaders.alazy_load():
118
+ _documents.append(document)
119
+ return _documents
119
120
  except Exception as e:
120
121
  logging.warning(f"Attempt {attempt+1} load document failed: {e}")
121
122
  await asyncio.sleep(1)
122
123
  if attempt == MAX_RETRIES - 1:
123
- logging.error(f"Failed to load documents: {e}")
124
+ tb = traceback.format_exc()
125
+ logging.error(f"Failed to load documents: {e} | {tb}")
124
126
  return []
127
+ finally:
128
+ del _documents
125
129
  finally:
130
+ del loaders
126
131
  gc.collect()
@@ -75,8 +75,14 @@ def diag(authenticate: bool = Depends(authenticate)):
75
75
  from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager as wsim
76
76
  from ws_bom_robot_app.llm.tools.tool_manager import ToolManager as wstm
77
77
  from ws_bom_robot_app.llm.agent_description import AgentDescriptor as wsad
78
+
78
79
  svmem = psutil.virtual_memory()
79
80
  swap = psutil.swap_memory()
81
+ try:
82
+ ws_bom_robot_app_version = pkg_resources.get_distribution("ws_bom_robot_app").version
83
+ except:
84
+ ws_bom_robot_app_version = "unknown"
85
+ peer_process_ids = [c.pid for c in psutil.Process(os.getppid()).children()] if config.runtime_options().is_multi_process else None
80
86
  return {
81
87
  "status":"ok",
82
88
  "uptime": {'from':_uptime,'elapsed':str(datetime.datetime.now()-_uptime)},
@@ -117,8 +123,9 @@ def diag(authenticate: bool = Depends(authenticate)):
117
123
  "os": {
118
124
  "ppid": os.getppid(),
119
125
  "pid": os.getpid(),
126
+ "pids": peer_process_ids,
120
127
  "cwd": os.getcwd(),
121
- "ws_bom_robot_app": pkg_resources.get_distribution("ws_bom_robot_app").version,
128
+ "ws_bom_robot_app": ws_bom_robot_app_version,
122
129
  "env": os.environ,
123
130
  },
124
131
  },
@@ -20,12 +20,19 @@ faiss-cpu==1.9.0
20
20
  #loaders
21
21
  python-magic==0.4.27
22
22
  opencv-python-headless==4.10.0.84 #docker specs
23
- unstructured[all-docs]==0.15.14
23
+ unstructured[all-docs]==0.16.11
24
24
  langchain_unstructured==0.1.5
25
25
  unstructured-ingest==0.3.8
26
+ unstructured-ingest[azure]
26
27
  unstructured-ingest[confluence]
28
+ unstructured-ingest[dropbox]
29
+ unstructured-ingest[gcs]
30
+ unstructured-ingest[google_drive]
27
31
  unstructured-ingest[github]
28
32
  unstructured-ingest[jira]
33
+ unstructured-ingest[s3]
34
+ unstructured-ingest[slack]
35
+ unstructured-ingest[sftp]
29
36
  html5lib==1.1 #beautifulsoup4 parser
30
37
 
31
38
  #integrations
@@ -116,6 +116,7 @@ class TaskStatistics(BaseModel):
116
116
  class TaskStatisticExecutionInfo(BaseModel):
117
117
  retention_days: float = config.robot_task_retention_days
118
118
  max_concurrent: int
119
+ pid: int = os.getpid()
119
120
  running: list[TaskStatus]
120
121
  slowest: list
121
122
  class TaskStatisticExecutionTime(BaseModel):
@@ -169,7 +170,7 @@ class TaskManagerStrategy(ABC):
169
170
  pass
170
171
 
171
172
  def task_cleanup_rule(self, task: TaskEntry) -> bool:
172
- return task.status.status in {"completed", "failure"} and datetime.fromisoformat(task.status.metadata.end_at) < datetime.now() - timedelta(days=config.robot_task_retention_days)
173
+ return task.status.metadata.start_at and datetime.fromisoformat(task.status.metadata.start_at) < datetime.now() - timedelta(days=config.robot_task_retention_days)
173
174
 
174
175
  def task_done_callback(self, task_entry: TaskEntry, headers: TaskHeader | None = None) -> Callable:
175
176
  def callback(task: asyncio.Task):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.21
3
+ Version: 0.0.23
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -23,12 +23,19 @@ Requires-Dist: langchain-core==0.3.21
23
23
  Requires-Dist: faiss-cpu==1.9.0
24
24
  Requires-Dist: python-magic==0.4.27
25
25
  Requires-Dist: opencv-python-headless==4.10.0.84
26
- Requires-Dist: unstructured[all-docs]==0.15.14
26
+ Requires-Dist: unstructured[all-docs]==0.16.11
27
27
  Requires-Dist: langchain_unstructured==0.1.5
28
28
  Requires-Dist: unstructured-ingest==0.3.8
29
+ Requires-Dist: unstructured-ingest[azure]
29
30
  Requires-Dist: unstructured-ingest[confluence]
31
+ Requires-Dist: unstructured-ingest[dropbox]
32
+ Requires-Dist: unstructured-ingest[gcs]
33
+ Requires-Dist: unstructured-ingest[google_drive]
30
34
  Requires-Dist: unstructured-ingest[github]
31
35
  Requires-Dist: unstructured-ingest[jira]
36
+ Requires-Dist: unstructured-ingest[s3]
37
+ Requires-Dist: unstructured-ingest[slack]
38
+ Requires-Dist: unstructured-ingest[sftp]
32
39
  Requires-Dist: html5lib==1.1
33
40
  Requires-Dist: markdownify==0.14.1
34
41
  Requires-Dist: nebuly==0.3.33
@@ -207,6 +214,13 @@ launch debugger
207
214
  streamlit run debugger.py --server.port 6002
208
215
  ```
209
216
 
217
+ dockerize app from src
218
+
219
+ ```pwsh
220
+ docker build -f Dockerfile-src -t ws-bom-robot-app:src .
221
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
222
+ ```
223
+
210
224
  ### ✈️ publish
211
225
 
212
226
  - [testpypi](https://test.pypi.org/project/ws-bom-robot-app/)
@@ -42,12 +42,19 @@ ws_bom_robot_app/llm/utils/webhooks.py
42
42
  ws_bom_robot_app/llm/vector_store/__init__.py
43
43
  ws_bom_robot_app/llm/vector_store/generator.py
44
44
  ws_bom_robot_app/llm/vector_store/integration/__init__.py
45
+ ws_bom_robot_app/llm/vector_store/integration/azure.py
45
46
  ws_bom_robot_app/llm/vector_store/integration/base.py
46
47
  ws_bom_robot_app/llm/vector_store/integration/confluence.py
48
+ ws_bom_robot_app/llm/vector_store/integration/dropbox.py
49
+ ws_bom_robot_app/llm/vector_store/integration/gcs.py
47
50
  ws_bom_robot_app/llm/vector_store/integration/github.py
51
+ ws_bom_robot_app/llm/vector_store/integration/googledrive.py
48
52
  ws_bom_robot_app/llm/vector_store/integration/jira.py
49
53
  ws_bom_robot_app/llm/vector_store/integration/manager.py
54
+ ws_bom_robot_app/llm/vector_store/integration/s3.py
55
+ ws_bom_robot_app/llm/vector_store/integration/sftp.py
50
56
  ws_bom_robot_app/llm/vector_store/integration/sitemap.py
57
+ ws_bom_robot_app/llm/vector_store/integration/slack.py
51
58
  ws_bom_robot_app/llm/vector_store/loader/__init__.py
52
59
  ws_bom_robot_app/llm/vector_store/loader/base.py
53
60
  ws_bom_robot_app/llm/vector_store/loader/json_loader.py
@@ -11,12 +11,19 @@ langchain-core==0.3.21
11
11
  faiss-cpu==1.9.0
12
12
  python-magic==0.4.27
13
13
  opencv-python-headless==4.10.0.84
14
- unstructured[all-docs]==0.15.14
14
+ unstructured[all-docs]==0.16.11
15
15
  langchain_unstructured==0.1.5
16
16
  unstructured-ingest==0.3.8
17
+ unstructured-ingest[azure]
17
18
  unstructured-ingest[confluence]
19
+ unstructured-ingest[dropbox]
20
+ unstructured-ingest[gcs]
21
+ unstructured-ingest[google_drive]
18
22
  unstructured-ingest[github]
19
23
  unstructured-ingest[jira]
24
+ unstructured-ingest[s3]
25
+ unstructured-ingest[slack]
26
+ unstructured-ingest[sftp]
20
27
  html5lib==1.1
21
28
  markdownify==0.14.1
22
29
  nebuly==0.3.33
@@ -1,44 +0,0 @@
1
- import os
2
- from langchain_core.documents import Document
3
- from abc import ABC, abstractmethod
4
- from unstructured_ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig, RetryStrategyConfig
5
- from typing import Union
6
-
7
- class IntegrationStrategy(ABC):
8
- def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
9
- self.knowledgebase_path = knowledgebase_path
10
- self.data = data
11
- self.working_directory = os.path.join(self.knowledgebase_path,self.working_subdirectory())
12
- os.makedirs(self.working_directory, exist_ok=True)
13
- @property
14
- @abstractmethod
15
- def working_subdirectory(self) -> str:
16
- pass
17
- @abstractmethod
18
- #@timer
19
- def load(self) -> list[Document]:
20
- pass
21
-
22
- class UnstructuredIngest():
23
- def __init__(self, working_directory: str):
24
- self.working_directory = working_directory
25
- def processor_config(self) -> ProcessorConfig:
26
- return ProcessorConfig(
27
- reprocess=False,
28
- verbose=False,
29
- work_dir=os.path.abspath(self.working_directory),
30
- #output_dir=os.path.abspath(self.working_directory),
31
- num_processes=1,
32
- raise_on_error=False
33
- )
34
- def read_config(self) -> ReadConfig:
35
- return ReadConfig(
36
- download_dir=os.path.abspath(self.working_directory),
37
- re_download=True,
38
- preserve_downloads=True,
39
- download_only=True
40
- )
41
- def partition_config(self) -> PartitionConfig:
42
- return None
43
- def retry_strategy_config(self) -> RetryStrategyConfig:
44
- return None
@@ -1,47 +0,0 @@
1
- import asyncio
2
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
- from unstructured_ingest.connector.confluence import SimpleConfluenceConfig, ConfluenceAccessConfig
4
- from unstructured_ingest.runner import ConfluenceRunner
5
- from langchain_core.documents import Document
6
- from ws_bom_robot_app.llm.vector_store.loader.base import Loader
7
- from typing import Optional, Union
8
- from pydantic import BaseModel, Field, AliasChoices
9
-
10
- class ConfluenceParams(BaseModel):
11
- url: str
12
- access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
13
- user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"))
14
- spaces: list[str] = []
15
- class Confluence(IntegrationStrategy):
16
- def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
17
- super().__init__(knowledgebase_path, data)
18
- self.__data = ConfluenceParams.model_validate(self.data)
19
- self.__loader = Loader(self.working_directory)
20
- self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
21
- def working_subdirectory(self) -> str:
22
- return 'confluence'
23
- def run(self) -> None:
24
- access_config = ConfluenceAccessConfig(
25
- api_token=self.__data.access_token
26
- )
27
- config = SimpleConfluenceConfig(
28
- user_email=self.__data.user_email,
29
- url = self.__data.url,
30
- access_config=access_config,
31
- #max_num_of_spaces=self.data.get('max_num_of_spaces',500),
32
- #max_num_of_docs_from_each_space=self.data.get('max_num_of_docs_from_each_space',100),
33
- spaces=self.__data.spaces
34
- )
35
- runner = ConfluenceRunner(
36
- connector_config=config,
37
- processor_config=self.__unstructured_ingest.processor_config(),
38
- read_config=self.__unstructured_ingest.read_config(),
39
- partition_config=self.__unstructured_ingest.partition_config(),
40
- retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
41
- )
42
- runner.run()
43
- async def load(self) -> list[Document]:
44
- await asyncio.to_thread(self.run)
45
- await asyncio.sleep(1)
46
- return await self.__loader.load()
47
-