ws-bom-robot-app 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,8 @@
1
1
  import os
2
2
  from langchain_core.documents import Document
3
3
  from abc import ABC, abstractmethod
4
- from unstructured_ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig, RetryStrategyConfig
4
+ from unstructured_ingest.v2.interfaces import ProcessorConfig
5
+ from unstructured_ingest.v2.pipeline.pipeline import Pipeline, PartitionerConfig, FiltererConfig
5
6
  from typing import Union
6
7
 
7
8
  class IntegrationStrategy(ABC):
@@ -9,7 +10,7 @@ class IntegrationStrategy(ABC):
9
10
  self.knowledgebase_path = knowledgebase_path
10
11
  self.data = data
11
12
  self.working_directory = os.path.join(self.knowledgebase_path,self.working_subdirectory())
12
- os.makedirs(self.working_directory, exist_ok=True)
13
+ os.makedirs(self.working_directory, mode=666, exist_ok=True)
13
14
  @property
14
15
  @abstractmethod
15
16
  def working_subdirectory(self) -> str:
@@ -22,23 +23,21 @@ class IntegrationStrategy(ABC):
22
23
  class UnstructuredIngest():
23
24
  def __init__(self, working_directory: str):
24
25
  self.working_directory = working_directory
25
- def processor_config(self) -> ProcessorConfig:
26
- return ProcessorConfig(
27
- reprocess=False,
28
- verbose=False,
29
- work_dir=self.working_directory,
30
- output_dir=self.working_directory,
31
- num_processes=1,
32
- raise_on_error=False
26
+ def pipeline(self,indexer,downloader,connection) -> Pipeline:
27
+ return Pipeline.from_configs(
28
+ context=ProcessorConfig(
29
+ reprocess=False,
30
+ verbose=False,
31
+ tqdm=False,
32
+ num_processes=2,
33
+ preserve_downloads=True,
34
+ download_only=True,
35
+ raise_on_error=False
36
+ ),
37
+ indexer_config=indexer,
38
+ downloader_config=downloader,
39
+ source_connection_config=connection,
40
+ partitioner_config=PartitionerConfig(),
41
+ filterer_config=FiltererConfig()
33
42
  )
34
- def read_config(self) -> ReadConfig:
35
- return ReadConfig(
36
- download_dir=self.working_directory,
37
- re_download=True,
38
- preserve_downloads=True,
39
- download_only=True
40
- )
41
- def partition_config(self) -> PartitionConfig:
42
- return None
43
- def retry_strategy_config(self) -> RetryStrategyConfig:
44
- return None
43
+
@@ -1,10 +1,9 @@
1
1
  import asyncio
2
2
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
- from unstructured_ingest.connector.confluence import SimpleConfluenceConfig, ConfluenceAccessConfig
4
- from unstructured_ingest.runner import ConfluenceRunner
3
+ from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
5
4
  from langchain_core.documents import Document
6
5
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
7
- from typing import Optional, Union
6
+ from typing import Union
8
7
  from pydantic import BaseModel, Field, AliasChoices
9
8
 
10
9
  class ConfluenceParams(BaseModel):
@@ -16,32 +15,27 @@ class Confluence(IntegrationStrategy):
16
15
  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
17
16
  super().__init__(knowledgebase_path, data)
18
17
  self.__data = ConfluenceParams.model_validate(self.data)
19
- self.__loader = Loader(self.working_directory)
20
18
  self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
21
19
  def working_subdirectory(self) -> str:
22
20
  return 'confluence'
23
21
  def run(self) -> None:
24
- access_config = ConfluenceAccessConfig(
25
- api_token=self.__data.access_token
26
- )
27
- config = SimpleConfluenceConfig(
28
- user_email=self.__data.user_email,
29
- url = self.__data.url,
30
- access_config=access_config,
31
- #max_num_of_spaces=self.data.get('max_num_of_spaces',500),
32
- #max_num_of_docs_from_each_space=self.data.get('max_num_of_docs_from_each_space',100),
22
+ indexer_config = ConfluenceIndexerConfig(
33
23
  spaces=self.__data.spaces
34
24
  )
35
- runner = ConfluenceRunner(
36
- connector_config=config,
37
- processor_config=self.__unstructured_ingest.processor_config(),
38
- read_config=self.__unstructured_ingest.read_config(),
39
- partition_config=self.__unstructured_ingest.partition_config(),
40
- retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
41
- )
42
- runner.run()
25
+ downloader_config = ConfluenceDownloaderConfig(
26
+ download_dir=self.working_directory
27
+ )
28
+ connection_config = ConfluenceConnectionConfig(
29
+ access_config=ConfluenceAccessConfig(api_token=self.__data.access_token),
30
+ url=self.__data.url,
31
+ user_email=self.__data.user_email
32
+ )
33
+ self.__unstructured_ingest.pipeline(
34
+ indexer_config,
35
+ downloader_config,
36
+ connection_config).run()
43
37
  async def load(self) -> list[Document]:
44
38
  await asyncio.to_thread(self.run)
45
39
  await asyncio.sleep(1)
46
- return await self.__loader.load()
40
+ return await Loader(self.working_directory).load()
47
41
 
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  from typing import Optional, Union
3
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
4
+ from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
4
5
  from unstructured_ingest.connector.git import GitAccessConfig
5
6
  from unstructured_ingest.connector.github import SimpleGitHubConfig
6
7
  from unstructured_ingest.runner import GithubRunner
@@ -17,8 +18,6 @@ class Github(IntegrationStrategy):
17
18
  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
18
19
  super().__init__(knowledgebase_path, data)
19
20
  self.__data = GithubParams.model_validate(self.data)
20
- self.__loader = Loader(self.working_directory)
21
- self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
22
21
  def working_subdirectory(self) -> str:
23
22
  return 'github'
24
23
  def run(self) -> None:
@@ -35,12 +34,13 @@ class Github(IntegrationStrategy):
35
34
  )
36
35
  runner = GithubRunner(
37
36
  connector_config=config,
38
- processor_config=self.__unstructured_ingest.processor_config(),
39
- read_config=self.__unstructured_ingest.read_config(),
40
- partition_config=self.__unstructured_ingest.partition_config(),
41
- retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
37
+ processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
38
+ read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
39
+ partition_config=None,
40
+ retry_strategy_config=None
42
41
  )
43
42
  runner.run()
44
43
  async def load(self) -> list[Document]:
45
44
  await asyncio.to_thread(self.run)
46
- return await self.__loader.load()
45
+ await asyncio.sleep(1)
46
+ return await Loader(self.working_directory).load()
@@ -1,5 +1,6 @@
1
1
  import asyncio
2
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
+ from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
3
4
  from unstructured_ingest.connector.jira import SimpleJiraConfig, JiraAccessConfig
4
5
  from unstructured_ingest.runner import JiraRunner
5
6
  from langchain_core.documents import Document
@@ -18,8 +19,6 @@ class Jira(IntegrationStrategy):
18
19
  def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
19
20
  super().__init__(knowledgebase_path, data)
20
21
  self.__data = JiraParams.model_validate(self.data)
21
- self.__loader = Loader(self.working_directory)
22
- self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
23
22
  def working_subdirectory(self) -> str:
24
23
  return 'jira'
25
24
  def run(self) -> None:
@@ -36,13 +35,13 @@ class Jira(IntegrationStrategy):
36
35
  )
37
36
  runner = JiraRunner(
38
37
  connector_config=config,
39
- processor_config=self.__unstructured_ingest.processor_config(),
40
- read_config=self.__unstructured_ingest.read_config(),
41
- partition_config=self.__unstructured_ingest.partition_config(),
42
- retry_strategy_config=self.__unstructured_ingest.retry_strategy_config()
38
+ processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
39
+ read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
40
+ partition_config=None,
41
+ retry_strategy_config=None
43
42
  )
44
43
  runner.run()
45
44
  async def load(self) -> list[Document]:
46
45
  await asyncio.to_thread(self.run)
47
46
  await asyncio.sleep(1)
48
- return await self.__loader.load()
47
+ return await Loader(self.working_directory).load()
@@ -1,15 +1,14 @@
1
1
 
2
- import asyncio
3
- from ws_bom_robot_app.config import config
4
- from typing import Any, Callable, Generator, Optional, Tuple
2
+ import asyncio, gc, logging, os, traceback
3
+ from typing import Any, Optional
5
4
  from langchain_community.document_loaders import DirectoryLoader
6
5
  from langchain_community.document_loaders.base import BaseLoader
7
6
  from langchain_community.document_loaders.merge import MergedDataLoader
8
7
  from langchain_core.documents import Document
9
8
  from langchain_unstructured import UnstructuredLoader
10
9
  from pydantic import BaseModel
10
+ from ws_bom_robot_app.config import config
11
11
  from ws_bom_robot_app.llm.vector_store.loader.json_loader import JsonLoader
12
- import gc, logging
13
12
 
14
13
  class LoaderConfig(BaseModel):
15
14
  loader: type[BaseLoader]
@@ -94,7 +93,7 @@ class Loader():
94
93
  for loader_config in loader_configs.values():
95
94
  loaders.append(
96
95
  DirectoryLoader(
97
- self.knowledgebase_path,
96
+ os.path.abspath(self.knowledgebase_path),
98
97
  glob=loader_config["glob_patterns"],
99
98
  loader_cls=loader_config["loader_cls"],
100
99
  loader_kwargs=loader_config["loader_kwargs"],
@@ -110,17 +109,23 @@ class Loader():
110
109
  #@timer
111
110
  async def load(self) -> list[Document]:
112
111
  MAX_RETRIES = 3
113
- loaders = MergedDataLoader(self.__directory_loader())
112
+ loaders: MergedDataLoader = MergedDataLoader(self.__directory_loader())
114
113
  try:
115
114
  for attempt in range(MAX_RETRIES):
116
115
  try:
117
- return await loaders.aload()
118
- #return await [doc async for doc in loaders.alazy_load()]
116
+ _documents = []
117
+ async for document in loaders.alazy_load():
118
+ _documents.append(document)
119
+ return _documents
119
120
  except Exception as e:
120
121
  logging.warning(f"Attempt {attempt+1} load document failed: {e}")
121
122
  await asyncio.sleep(1)
122
123
  if attempt == MAX_RETRIES - 1:
123
- logging.error(f"Failed to load documents: {e}")
124
+ tb = traceback.format_exc()
125
+ logging.error(f"Failed to load documents: {e} | {tb}")
124
126
  return []
127
+ finally:
128
+ del _documents
125
129
  finally:
130
+ del loaders
126
131
  gc.collect()
ws_bom_robot_app/main.py CHANGED
@@ -75,8 +75,14 @@ def diag(authenticate: bool = Depends(authenticate)):
75
75
  from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager as wsim
76
76
  from ws_bom_robot_app.llm.tools.tool_manager import ToolManager as wstm
77
77
  from ws_bom_robot_app.llm.agent_description import AgentDescriptor as wsad
78
+
78
79
  svmem = psutil.virtual_memory()
79
80
  swap = psutil.swap_memory()
81
+ try:
82
+ ws_bom_robot_app_version = pkg_resources.get_distribution("ws_bom_robot_app").version
83
+ except:
84
+ ws_bom_robot_app_version = "unknown"
85
+ peer_process_ids = [c.pid for c in psutil.Process(os.getppid()).children()] if config.runtime_options().is_multi_process else None
80
86
  return {
81
87
  "status":"ok",
82
88
  "uptime": {'from':_uptime,'elapsed':str(datetime.datetime.now()-_uptime)},
@@ -117,8 +123,9 @@ def diag(authenticate: bool = Depends(authenticate)):
117
123
  "os": {
118
124
  "ppid": os.getppid(),
119
125
  "pid": os.getpid(),
126
+ "pids": peer_process_ids,
120
127
  "cwd": os.getcwd(),
121
- "ws_bom_robot_app": pkg_resources.get_distribution("ws_bom_robot_app").version,
128
+ "ws_bom_robot_app": ws_bom_robot_app_version,
122
129
  "env": os.environ,
123
130
  },
124
131
  },
@@ -116,6 +116,7 @@ class TaskStatistics(BaseModel):
116
116
  class TaskStatisticExecutionInfo(BaseModel):
117
117
  retention_days: float = config.robot_task_retention_days
118
118
  max_concurrent: int
119
+ pid: int = os.getpid()
119
120
  running: list[TaskStatus]
120
121
  slowest: list
121
122
  class TaskStatisticExecutionTime(BaseModel):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.20
3
+ Version: 0.0.22
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -23,7 +23,7 @@ Requires-Dist: langchain-core==0.3.21
23
23
  Requires-Dist: faiss-cpu==1.9.0
24
24
  Requires-Dist: python-magic==0.4.27
25
25
  Requires-Dist: opencv-python-headless==4.10.0.84
26
- Requires-Dist: unstructured[all-docs]==0.15.14
26
+ Requires-Dist: unstructured[all-docs]==0.16.11
27
27
  Requires-Dist: langchain_unstructured==0.1.5
28
28
  Requires-Dist: unstructured-ingest==0.3.8
29
29
  Requires-Dist: unstructured-ingest[confluence]
@@ -207,6 +207,13 @@ launch debugger
207
207
  streamlit run debugger.py --server.port 6002
208
208
  ```
209
209
 
210
+ dockerize app from src
211
+
212
+ ```pwsh
213
+ docker build -f Dockerfile-src -t ws-bom-robot-app:src .
214
+ docker run --name ws-bom-robot-app-src -d -v "$(pwd)/ws_bom_robot_app:/app/ws_bom_robot_app" -p 6001:6001 ws-bom-robot-app:src
215
+ ```
216
+
210
217
  ### ✈️ publish
211
218
 
212
219
  - [testpypi](https://test.pypi.org/project/ws-bom-robot-app/)
@@ -2,8 +2,8 @@ ws_bom_robot_app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
2
2
  ws_bom_robot_app/auth.py,sha256=84nIbmJsMrNs0sxIQGEHbjsjc2P6ZrZZGSn8dkiL6is,895
3
3
  ws_bom_robot_app/config.py,sha256=V5ZrX_JnzpsN32hTTezTfOvEZYkIQBy0lxIQ9JFHdFE,3170
4
4
  ws_bom_robot_app/cron_manager.py,sha256=0Yt5AMTPGlXZ_M5ck0SKMX8wvzoPsseEezg_s0Q3HKY,9224
5
- ws_bom_robot_app/main.py,sha256=PkWGkBYXCEPBxPUGsaq-Wvdcy7CkTL_0wNeE0s7MGwc,5814
6
- ws_bom_robot_app/task_manager.py,sha256=WnuJ-noOLM4AiFaEp67dGTdFJhhPQAzFvEDUtJXaFAA,15941
5
+ ws_bom_robot_app/main.py,sha256=vChP8vfmOCbs51TPUsaaxX8FvoFXuURMkOgmgx0Xi_4,6121
6
+ ws_bom_robot_app/task_manager.py,sha256=tuqyVJ52KFE9kVjVbrFrjuUKXIxtEBCOV-BLmYCzaMo,15973
7
7
  ws_bom_robot_app/util.py,sha256=b49ItlZgh2Wzw-6K8k5Wa44eVgjQ0JmWQwJnEaQBVGw,3502
8
8
  ws_bom_robot_app/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  ws_bom_robot_app/llm/agent_description.py,sha256=SDJYMmwfdMxEK3a_HDEQ19bfNKmwMSFf5hqU0VSCCIE,4705
@@ -33,16 +33,16 @@ ws_bom_robot_app/llm/utils/webhooks.py,sha256=LAAZqyN6VhV13wu4X-X85TwdDgAV2rNvIw
33
33
  ws_bom_robot_app/llm/vector_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
34
  ws_bom_robot_app/llm/vector_store/generator.py,sha256=aVUG08rcsDvtVxfYfUalO1CbKGkLazNyP555IsQQZso,5975
35
35
  ws_bom_robot_app/llm/vector_store/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
- ws_bom_robot_app/llm/vector_store/integration/base.py,sha256=PXsAXHGDHzij9OKlBhnt3IKZDBfgZcaOYa1_g_NriP0,1471
37
- ws_bom_robot_app/llm/vector_store/integration/confluence.py,sha256=cCVFMb-poXkauRuK0Z3AwGVZk3MLZzZgXBarfWlR1SY,2159
38
- ws_bom_robot_app/llm/vector_store/integration/github.py,sha256=BugkPjbqjWRN7n4LOkoqsHXZ_vo_Xff_V-PwoU6y1WE,2135
39
- ws_bom_robot_app/llm/vector_store/integration/jira.py,sha256=xq4oxRl1xhkbPjFTV6O-POxsyAuFNWEG9fZ7P7qVSQE,2090
36
+ ws_bom_robot_app/llm/vector_store/integration/base.py,sha256=dUXvibkKjnCdpNO-KrWD1Gvfw4SNdh-ujwkWRVer-dA,1464
37
+ ws_bom_robot_app/llm/vector_store/integration/confluence.py,sha256=EI_ZwBvSP0eWUPeimxMxnZRIISnbWkKpIDITBXn2bsI,1837
38
+ ws_bom_robot_app/llm/vector_store/integration/github.py,sha256=g_uSG5YNgiXdvLutCA8GAPuX3Y41JZGH8Fj5Jm5gCg0,2124
39
+ ws_bom_robot_app/llm/vector_store/integration/jira.py,sha256=FNhl_TRCltyFpD9EA9my5g-aUt6m2_ylqy34i6fGvZA,2049
40
40
  ws_bom_robot_app/llm/vector_store/integration/manager.py,sha256=YDQOgwMQxdRrVXIG3b0D6fHd1vGwQmgbAwigtWeeHW0,894
41
41
  ws_bom_robot_app/llm/vector_store/integration/sitemap.py,sha256=nPbIywp-ZwWbWStvjvYVgHqqejyYFr8eZhBc8ycTuaU,4206
42
42
  ws_bom_robot_app/llm/vector_store/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
- ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=qVBM2SIrPPnS__7Z8ZSkEAZ707wURhDyCK-0ctNStmE,4967
43
+ ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=ZvcyUPOoQ44gyfl80Jf0y9vbcj8uTUb-lYkb_m2pt1I,5137
44
44
  ws_bom_robot_app/llm/vector_store/loader/json_loader.py,sha256=qo9ejRZyKv_k6jnGgXnu1W5uqsMMtgqK_uvPpZQ0p74,833
45
- ws_bom_robot_app-0.0.20.dist-info/METADATA,sha256=XOAypwk7Ww1u-iYJZdjKu-HEJT5dH-KlPaaJO_2ms4w,6620
46
- ws_bom_robot_app-0.0.20.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
47
- ws_bom_robot_app-0.0.20.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
48
- ws_bom_robot_app-0.0.20.dist-info/RECORD,,
45
+ ws_bom_robot_app-0.0.22.dist-info/METADATA,sha256=q-ObaxLd_n38aXwFmB6I1sxqzK76AP9vT0v_q6FOChc,6848
46
+ ws_bom_robot_app-0.0.22.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
47
+ ws_bom_robot_app-0.0.22.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
48
+ ws_bom_robot_app-0.0.22.dist-info/RECORD,,