ws-bom-robot-app 0.0.80__py3-none-any.whl → 0.0.82__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- ws_bom_robot_app/config.py +10 -0
- ws_bom_robot_app/cron_manager.py +6 -6
- ws_bom_robot_app/llm/api.py +2 -2
- ws_bom_robot_app/llm/providers/llm_manager.py +5 -6
- ws_bom_robot_app/llm/utils/cleanup.py +7 -0
- ws_bom_robot_app/llm/utils/download.py +0 -2
- ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/base.py +57 -15
- ws_bom_robot_app/llm/vector_store/integration/confluence.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
- ws_bom_robot_app/llm/vector_store/integration/googledrive.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/jira.py +93 -60
- ws_bom_robot_app/llm/vector_store/integration/manager.py +2 -0
- ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
- ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
- ws_bom_robot_app/llm/vector_store/integration/sitemap.py +3 -0
- ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
- ws_bom_robot_app/llm/vector_store/integration/thron.py +2 -3
- ws_bom_robot_app/llm/vector_store/loader/base.py +8 -6
- ws_bom_robot_app/llm/vector_store/loader/docling.py +1 -1
- ws_bom_robot_app/subprocess_runner.py +103 -0
- ws_bom_robot_app/task_manager.py +169 -41
- {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/METADATA +18 -8
- {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/RECORD +30 -28
- {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/WHEEL +0 -0
- {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/config.py
CHANGED
@@ -16,9 +16,14 @@ class Settings(BaseSettings):
     robot_data_db_retention_days: float = 60
     robot_data_attachment_folder: str = 'attachment'
     robot_data_attachment_retention_days: float = 1
+    robot_ingest_max_threads: int = 1 # safe choice to 1, avoid potential process-related issues with Docker
     robot_loader_max_threads: int = 1
     robot_task_max_total_parallelism: int = 2 * (os.cpu_count() or 1)
     robot_task_retention_days: float = 1
+    robot_task_strategy: str = 'memory' # memory / db
+    robot_task_mp_enable: bool = True
+    robot_task_mp_method: str = 'spawn' # spawn / fork
+    robot_cron_strategy: str = 'memory' # memory / db
     robot_cms_host: str = ''
     robot_cms_auth: str = ''
     robot_cms_db_folder: str = 'llmVectorDb'
@@ -41,6 +46,7 @@ class Settings(BaseSettings):
     )
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        # env
         os.environ["USER_AGENT"] = self.USER_AGENT
         os.environ["OPENAI_API_KEY"] = self.OPENAI_API_KEY
         os.environ["OLLAMA_API_URL"] = self.OLLAMA_API_URL
@@ -53,6 +59,10 @@ class Settings(BaseSettings):
         os.environ["WATSONX_APIKEY"] = self.WATSONX_APIKEY
         os.environ["WATSONX_PROJECTID"] = self.WATSONX_PROJECTID
         os.environ["NEBULY_API_URL"] = self.NEBULY_API_URL
+        # dir
+        os.makedirs(self.robot_data_folder, exist_ok=True)
+        for subfolder in [self.robot_data_db_folder, self.robot_data_attachment_folder, 'db']:
+            os.makedirs(os.path.join(self.robot_data_folder, subfolder), exist_ok=True)

 class RuntimeOptions(BaseModel):
     @staticmethod
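Because the new options are plain `BaseSettings` fields, each can presumably be overridden through an environment variable of the same name before the app starts. A minimal sketch, assuming pydantic's default case-insensitive env matching; the field subset below mirrors the diff and is not the full class:

```python
import os
from pydantic_settings import BaseSettings  # pydantic v2 layout; adjust if the app pins v1

class Settings(BaseSettings):
    # subset of the 0.0.82 fields, copied from the diff above
    robot_ingest_max_threads: int = 1    # keep at 1 to avoid process issues under Docker
    robot_task_strategy: str = 'memory'  # memory / db
    robot_task_mp_enable: bool = True
    robot_task_mp_method: str = 'spawn'  # spawn / fork
    robot_cron_strategy: str = 'memory'  # memory / db

os.environ["ROBOT_CRON_STRATEGY"] = "db"  # matched case-insensitively by default
print(Settings().robot_cron_strategy)     # -> 'db'
```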
ws_bom_robot_app/cron_manager.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 from apscheduler.schedulers.background import BackgroundScheduler
 #from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.jobstores.memory import MemoryJobStore
@@ -7,8 +8,7 @@ from apscheduler.triggers.interval import IntervalTrigger
 from apscheduler.triggers.date import DateTrigger
 from fastapi import APIRouter
 from datetime import datetime
-from ws_bom_robot_app.
-from ws_bom_robot_app.llm.utils.cleanup import kb_cleanup_data_file, chat_cleanup_attachment
+from ws_bom_robot_app.llm.utils.cleanup import kb_cleanup_data_file, chat_cleanup_attachment, task_cleanup_history
 from ws_bom_robot_app.util import _log
 from ws_bom_robot_app.config import config

@@ -22,8 +22,8 @@ class MemoryJobstoreStrategy(JobstoreStrategy):
         return {"default": MemoryJobStore()}

 class PersistentJobstoreStrategy(JobstoreStrategy):
-    def get_jobstore(self, db_url: str = "sqlite
-        _log.info(f"Using persistent
+    def get_jobstore(self, db_url: str = f"sqlite:///{config.robot_data_folder}/db/jobs.sqlite"):
+        _log.info(f"Using persistent cron jobstore with database URL: {db_url}.")
         return {"default": SQLAlchemyJobStore(url=db_url)}

 class Job:
@@ -56,12 +56,12 @@ class Job:

 class CronManager:
     _list_default = [
-        Job('cleanup-task',
+        Job('cleanup-task-history',task_cleanup_history, interval=5 * 60),
         Job('cleanup-kb-data',kb_cleanup_data_file, interval=180 * 60),
         Job('cleanup-chat-attachment',chat_cleanup_attachment, interval=120 * 60),
     ]
     def __get_jobstore_strategy(self) -> JobstoreStrategy:
-        if
+        if config.robot_cron_strategy == 'memory':
             return MemoryJobstoreStrategy()
         return PersistentJobstoreStrategy()
     def __init__(self, strategy: JobstoreStrategy = None, enable_defaults: bool = True):
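The new `robot_cron_strategy` flag selects between the two jobstore strategies above. A minimal sketch of the same pattern against APScheduler's public API; the function name and SQLite URL are illustrative, not the package's:

```python
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.memory import MemoryJobStore
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore

def make_jobstores(strategy: str, db_url: str = "sqlite:///jobs.sqlite") -> dict:
    # 'db' persists schedules across restarts, like PersistentJobstoreStrategy
    if strategy == "memory":
        return {"default": MemoryJobStore()}
    return {"default": SQLAlchemyJobStore(url=db_url)}

def cleanup_task_history() -> None:
    # persistent stores need a picklable, module-level callable (no lambdas)
    print("cleaning task history")

scheduler = BackgroundScheduler(jobstores=make_jobstores("db"))
scheduler.add_job(cleanup_task_history, "interval", seconds=5 * 60,
                  id="cleanup-task-history", replace_existing=True)
scheduler.start()
```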
ws_bom_robot_app/llm/api.py
CHANGED
@@ -52,7 +52,7 @@ async def _kb(rq: KbRequest) -> VectorDbResponse:

 @router.post("/kb/task")
 async def _kb_task(rq: KbRequest, headers: Annotated[TaskHeader, Header()]) -> IdentifiableEntity:
-    return task_manager.create_task(kb(rq),headers)
+    return task_manager.create_task(lambda: kb(rq),headers)

 @router.post("/rules")
 async def _rules(rq: RulesRequest) -> VectorDbResponse:
@@ -60,7 +60,7 @@ async def _rules(rq: RulesRequest) -> VectorDbResponse:

 @router.post("/rules/task")
 async def _rules_task(rq: RulesRequest, headers: Annotated[TaskHeader, Header()]) -> IdentifiableEntity:
-    return task_manager.create_task(rules(rq),headers)
+    return task_manager.create_task(lambda: rules(rq), headers)

 @router.get("/kb/file/{filename}")
 async def _kb_get_file(filename: str) -> StreamingResponse:
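Both endpoints now pass a factory instead of an already-created coroutine: `kb(rq)` would instantiate the coroutine inside the request handler, while `lambda: kb(rq)` lets the task manager create it wherever the task actually runs, which matters for the multiprocessing support added in `task_manager.py`. An illustrative sketch, not the app's actual task manager:

```python
import asyncio
from typing import Awaitable, Callable

async def kb(rq: str) -> str:
    return f"built vector db for {rq!r}"

def create_task(factory: Callable[[], Awaitable[str]]) -> str:
    # the coroutine is created only here, e.g. inside a worker
    # process that owns its own event loop
    return asyncio.run(factory())

print(create_task(lambda: kb("demo")))  # -> "built vector db for 'demo'"
```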

ws_bom_robot_app/llm/providers/llm_manager.py
CHANGED

@@ -69,8 +69,7 @@ class Anthropic(LlmInterface):
             model=self.config.model,
             temperature=self.config.temperature,
             max_tokens=8192,
-            streaming=True
-            stream_usage=True
+            streaming=True
         )

     """
@@ -107,8 +106,9 @@ class OpenAI(LlmInterface):
         chat = ChatOpenAI(
             api_key=self.config.api_key or os.getenv("OPENAI_API_KEY"),
             model=self.config.model,
-
-
+            streaming=True
+        )
+        if not (any(self.config.model.startswith(prefix) for prefix in ["gpt-5", "o1", "o3"]) or "search" in self.config.model):
             chat.temperature = self.config.temperature
             chat.streaming = True
         return chat
@@ -135,8 +135,7 @@ class DeepSeek(LlmInterface):
             base_url="https://api.deepseek.com",
             max_tokens=8192,
             temperature=self.config.temperature,
-            streaming=True
-            stream_usage=True,
+            streaming=True
         )

     def get_models(self):
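The OpenAI change guards `temperature` behind a model check: per the diff, `gpt-5*`, `o1*`, `o3*`, and `*search*` models do not get a custom temperature. A standalone sketch of just that predicate; the prefix list comes from the diff, not from an official OpenAI capability matrix:

```python
def supports_temperature(model: str) -> bool:
    # prefixes copied from the diff; "search" matches e.g. gpt-4o-search-preview
    return not (model.startswith(("gpt-5", "o1", "o3")) or "search" in model)

for m in ("gpt-4o", "o3-mini", "gpt-4o-search-preview"):
    print(m, supports_temperature(m))  # True / False / False
```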

ws_bom_robot_app/llm/utils/cleanup.py
CHANGED

@@ -1,6 +1,7 @@
 import os, logging
 from ws_bom_robot_app.config import config
 from datetime import datetime, timedelta
+from ws_bom_robot_app.task_manager import task_manager

 def _cleanup_data_file(folders: list[str], retention: float) -> dict:
     """
@@ -72,3 +73,9 @@ def chat_cleanup_attachment() -> dict:
         os.path.join(config.robot_data_folder, config.robot_data_attachment_folder)
     ]
     return _cleanup_data_file(folders, config.robot_data_attachment_retention_days)
+
+def task_cleanup_history() -> None:
+    """
+    clean up task queue
+    """
+    task_manager.cleanup_task()

ws_bom_robot_app/llm/utils/download.py
CHANGED

@@ -84,8 +84,6 @@ async def download_file(url: str, destination: str, chunk_size: int = 8192, auth
     except OSError:
         pass

-    # ensuse attachment folder exists
-    os.makedirs(os.path.join(config.robot_data_folder, config.robot_data_attachment_folder), exist_ok=True)
 class Base64File(BaseModel):
     """Base64 encoded file representation"""
     url: str

ws_bom_robot_app/llm/vector_store/integration/azure.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.
+from unstructured_ingest.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union, Optional

ws_bom_robot_app/llm/vector_store/integration/base.py
CHANGED

@@ -1,10 +1,17 @@
-import os
+import os, copy
+from random import random
 from langchain_core.documents import Document
 from abc import ABC, abstractmethod
-from unstructured_ingest.
-from unstructured_ingest.
+from unstructured_ingest.interfaces import ProcessorConfig
+from unstructured_ingest.pipeline.pipeline import (
+    Pipeline,
+    PartitionerConfig,
+    FiltererConfig
+)
+from unstructured_ingest.processes.connector_registry import source_registry
 from typing import Union
 from ws_bom_robot_app.llm.utils.secrets import Secrets
+from ws_bom_robot_app.config import config

 class IntegrationStrategy(ABC):
     @classmethod
@@ -32,23 +39,58 @@ class IntegrationStrategy(ABC):
         pass

 class UnstructuredIngest():
+    _PIPELINE: Pipeline = None
     def __init__(self, working_directory: str):
         self.working_directory = working_directory
-    def pipeline(self,
-
-
+    def pipeline(self,indexer_config,downloader_config,connection_config,extension: list[str] = None) -> Pipeline:
+        def _default_processor_config() -> ProcessorConfig:
+            return ProcessorConfig(
                 reprocess=False,
                 verbose=False,
                 tqdm=False,
-                num_processes=
+                num_processes=config.robot_ingest_max_threads, #safe choice to 1, avoid potential process-related issues with Docker
+                disable_parallelism=False,
                 preserve_downloads=True,
                 download_only=True,
-                raise_on_error=False
-
-
-
-
-
-
+                raise_on_error=False,
+                iter_delete=True,
+                delete_cache=False #already managed by the generator task
+            )
+        def _init_pipeline() -> Pipeline:
+            return Pipeline.from_configs(
+                context=_default_processor_config(),
+                indexer_config=indexer_config,
+                downloader_config=downloader_config,
+                source_connection_config=connection_config,
+                partitioner_config=PartitionerConfig(),
+                filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
+            )
+        def _instance_pipeline() -> Pipeline:
+            from unstructured_ingest.pipeline.steps.index import IndexStep
+            from unstructured_ingest.pipeline.steps.download import DownloadStep
+            from unstructured_ingest.pipeline.steps.filter import Filterer, FilterStep
+            _context = _default_processor_config()
+            source_entry = {
+                k: v
+                for k, v in source_registry.items()
+                if type(indexer_config) is v.indexer_config
+                and type(downloader_config) is v.downloader_config
+                and type(connection_config) is v.connection_config
+            }
+            source = list(source_entry.values())[0]
+            _pipeline = copy.deepcopy(UnstructuredIngest._PIPELINE)
+            _pipeline.context = _context
+            _pipeline.context.work_dir = f"{self.working_directory}_unstructured" # use sibling directory, cleaned up by the generator task
+            _pipeline.indexer_step = IndexStep(process=source.indexer(index_config=indexer_config, connection_config=connection_config), context=_context)
+            _pipeline.downloader_step = DownloadStep(process=source.downloader(download_config=downloader_config, connection_config=connection_config), context=_context)
+            _pipeline.filter_step = FilterStep(process=Filterer(config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)), context=_context) if extension else None
+            return _pipeline

+        if not UnstructuredIngest._PIPELINE:
+            import random
+            import time
+            time.sleep(random.uniform(0.2, 1))
+            if not UnstructuredIngest._PIPELINE:
+                UnstructuredIngest._PIPELINE = _init_pipeline()
+
+        return _instance_pipeline()
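`pipeline()` now builds a single template `Pipeline` lazily and hands each caller a deep copy with its own index/download/filter steps; the randomized sleep before the second check is a best-effort guard against two first callers racing, not a real lock. A minimal sketch of the same lazy-template-plus-copy shape using `threading.Lock`, which closes that race (illustrative, not the package's code):

```python
import copy, threading

class PipelineCache:
    _template = None
    _lock = threading.Lock()

    @classmethod
    def get(cls, build):
        if cls._template is None:
            with cls._lock:
                if cls._template is None:      # double-checked locking
                    cls._template = build()
        return copy.deepcopy(cls._template)    # per-call instance, like _instance_pipeline()

p1 = PipelineCache.get(lambda: {"steps": ["index", "download", "filter"]})
p2 = PipelineCache.get(lambda: {"steps": ["index", "download", "filter"]})
print(p1 is p2)  # False: each caller gets its own copy of the shared template
```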

ws_bom_robot_app/llm/vector_store/integration/confluence.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.
+from unstructured_ingest.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Optional, Union

ws_bom_robot_app/llm/vector_store/integration/dropbox.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.
+from unstructured_ingest.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union

ws_bom_robot_app/llm/vector_store/integration/gcs.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.
+from unstructured_ingest.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union, Optional

ws_bom_robot_app/llm/vector_store/integration/github.py
CHANGED

@@ -1,10 +1,12 @@
 import asyncio
 from typing import Optional, Union
-from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
-from unstructured_ingest.
-
-
-
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+from unstructured_ingest.processes.connectors.github import (
+    GithubIndexerConfig,
+    GithubDownloaderConfig,
+    GithubConnectionConfig,
+    GithubAccessConfig
+)
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from pydantic import BaseModel, Field, AliasChoices
@@ -27,28 +29,26 @@ class Github(IntegrationStrategy):
     def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
         super().__init__(knowledgebase_path, data)
         self.__data = GithubParams.model_validate(self.data)
+        self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
     def working_subdirectory(self) -> str:
         return 'github'
     def run(self) -> None:
-
-            access_token=self.__data.access_token
-        )
-        file_ext = self.__data.file_ext or None
-        file_glob = [f"**/*{ext}" for ext in file_ext] if file_ext else None
-        config = SimpleGitHubConfig(
-            url = self.__data.repo,
-            access_config=access_config,
+        indexer_config = GithubIndexerConfig(
             branch=self.__data.branch,
-
+            recursive=True
+        )
+        downloader_config = GithubDownloaderConfig(
+            download_dir=self.working_directory
+        )
+        connection_config = GithubConnectionConfig(
+            access_config=GithubAccessConfig(access_token=self.__data.access_token),
+            url=self.__data.repo
         )
-
-
-
-
-
-            retry_strategy_config=None
-        )
-        runner.run()
+        self.__unstructured_ingest.pipeline(
+            indexer_config,
+            downloader_config,
+            connection_config,
+            extension=self.__data.file_ext).run()
     async def load(self) -> list[Document]:
         await asyncio.to_thread(self.run)
         await asyncio.sleep(1)
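The Github strategy now follows the same three-config shape as the other connectors: an indexer config, a downloader config pointed at the working directory, and a connection config wrapping the access token, all handed to `UnstructuredIngest.pipeline(...)`. The `file_ext` list ends up as the filterer's `file_glob`; a tiny sketch of that mapping, lifted from the expressions in `base.py` above:

```python
def extension_to_glob(extension: list[str] | None) -> list[str] | None:
    # same expression as FiltererConfig(file_glob=...) in base.py
    return [f"**/*{ext}" for ext in extension] if extension else None

print(extension_to_glob([".md", ".py"]))  # ['**/*.md', '**/*.py']
print(extension_to_glob(None))            # None (no filtering)
```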

ws_bom_robot_app/llm/vector_store/integration/googledrive.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.
+from unstructured_ingest.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union

ws_bom_robot_app/llm/vector_store/integration/jira.py
CHANGED

@@ -1,21 +1,39 @@
+import logging
 import asyncio, os
-
+import sys
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from pydantic import BaseModel, Field, AliasChoices
-from typing import Any, Optional, Union
-from unstructured_ingest.
-from unstructured_ingest.
-
-
+from typing import Any, Generator, Iterable, Optional, Union
+from unstructured_ingest.pipeline.pipeline import Pipeline
+from unstructured_ingest.processes.connectors.jira import (
+    JiraIndexerConfig,
+    JiraIndexer,
+    JiraIssueMetadata,
+    api_page_based_generator,
+    JiraDownloaderConfig,
+    JiraDownloader,
+    DEFAULT_C_SEP,
+    DEFAULT_R_SEP,
+    JiraConnectionConfig,
+    JiraAccessConfig
+)
+from unstructured_ingest.pipeline.pipeline import (
+    Pipeline,
+    PartitionerConfig,
+    FiltererConfig
+)
+from unstructured_ingest.interfaces import ProcessorConfig

 class JiraParams(BaseModel):
     """
     JiraParams is a Pydantic model that represents the parameters required to interact with a Jira instance.
+    Docs: https://docs.unstructured.io/open-source/ingestion/source-connectors/jira#jira

     Attributes:
         url (str): The URL of the Jira instance, e.g., 'https://example.atlassian.net'.
-        access_token (str): The access token for authenticating with the Jira API.
+        access_token (str): The access token for authenticating with the Jira API: https://id.atlassian.com/manage-profile/security/api-tokens
         user_email (str): The email address of the Jira user.
         projects (list[str]): A list of project keys or IDs to interact with, e.g., ['SCRUM', 'PROJ1'].
         boards (Optional[list[str]]): An optional list of board IDs to interact with. Defaults to None, e.g., ['1', '2'].
@@ -27,34 +45,45 @@ class JiraParams(BaseModel):
     projects: list[str]
     boards: Optional[list[str]] | None = None
     issues: Optional[list[str]] | None = None
+    status_filters: Optional[list[str]] | None = None

 class Jira(IntegrationStrategy):
     def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
         super().__init__(knowledgebase_path, data)
         self.__data = JiraParams.model_validate(self.data)
+        self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
     def working_subdirectory(self) -> str:
         return 'jira'
     def run(self) -> None:
-
-            api_token=self.__data.access_token
-        )
-        config = SimpleJiraConfig(
-            user_email=self.__data.user_email,
-            url = self.__data.url,
-            access_config=access_config,
+        indexer_config = JiraIndexerConfig(
             projects=self.__data.projects,
             boards=self.__data.boards,
-            issues=self.__data.issues
-
-        # runner override: waiting for v2 migration https://github.com/Unstructured-IO/unstructured-ingest/issues/106
-        runner = _JiraRunner(
-            connector_config=config,
-            processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
-            read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
-            partition_config=None,
-            retry_strategy_config=None
+            issues=self.__data.issues,
+            status_filters=self.__data.status_filters
         )
-
+        downloader_config = JiraDownloaderConfig(
+            download_dir=self.working_directory,
+            download_attachments=False
+        )
+        _is_cloud = "atlassian.net" in self.__data.url
+        _access_config = JiraAccessConfig(token=self.__data.access_token) \
+            if not _is_cloud \
+            else JiraAccessConfig(password=self.__data.access_token)
+        connection_config = JiraConnectionConfig(
+            access_config=_access_config,
+            username=self.__data.user_email,
+            url=self.__data.url,
+            cloud=_is_cloud
+        )
+        pipeline: Pipeline = self.__unstructured_ingest.pipeline(
+            indexer_config,
+            downloader_config,
+            connection_config,
+            extension=None)
+        if _is_cloud and sys.platform == "win32":
+            pipeline.indexer_step.process = CustomJiraIndexer(**vars(pipeline.indexer_step.process))
+            pipeline.downloader_step.process = CustomJiraDownloader(**vars(pipeline.downloader_step.process))
+        pipeline.run()
     async def load(self) -> list[Document]:
         await asyncio.to_thread(self.run)
         await asyncio.sleep(1)
@@ -62,8 +91,38 @@ class Jira(IntegrationStrategy):


 # region override
-class
-
+class CustomJiraIndexer(JiraIndexer):
+    """
+    fix default run_jql for cloud: missing enhanced_jql
+    """
+    import sys
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            try:
+                setattr(super(), key, value)
+            except AttributeError:
+                setattr(self, key, value)
+    def run_jql(self, jql: str, **kwargs) -> Generator[JiraIssueMetadata, None, None]:
+        with self.connection_config.get_client() as client:
+            for issue in api_page_based_generator(client.jql, jql=jql, **kwargs):
+                yield JiraIssueMetadata.model_validate(issue)
+
+class CustomJiraDownloader(JiraDownloader):
+    CUSTOM_FIELDS: list | None = None
+    def _set_custom_fields(self) -> list:
+        with self.connection_config.get_client() as client:
+            _custom_fields = client.get_all_custom_fields()
+        return [{"id": item["id"], "name": item["name"]} for item in _custom_fields]
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            try:
+                setattr(super(), key, value)
+            except AttributeError:
+                setattr(self, key, value)
+        if not self.CUSTOM_FIELDS:
+            self.CUSTOM_FIELDS = self._set_custom_fields()
+
+    def _get_custom_fields_for_issue(self, issue: dict, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP) -> str:
         def _parse_value(value: Any) -> Any:
             if isinstance(value, dict):
                 _candidate = ["displayName", "name", "value"]
@@ -74,45 +133,19 @@ class _JiraIngestDoc(JiraIngestDoc):
         def _remap_custom_fields(fields: dict):
             remapped_fields = {}
             for field_key, field_value in fields.items():
-                new_key = next((map_item["name"] for map_item in
+                new_key = next((map_item["name"] for map_item in self.CUSTOM_FIELDS if field_key == map_item["id"]), field_key)
                 if new_key != field_value:
                     remapped_fields[new_key] = field_value
             return remapped_fields
         filtered_fields = {key: _parse_value(value) for key, value in issue.items() if value is not None and type(value) not in [list]}
         custom_fields =_remap_custom_fields(filtered_fields)
         return (r_sep + c_sep ).join([f"{key}: {value}{r_sep}" for key, value in custom_fields.items()])
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        _issue = self.issue
-        _nested: dict = nested_object_to_field_getter(_issue["fields"])
-        document = "\n\n\n".join(
-            [
-                _get_id_fields_for_issue(_issue),
-                _get_project_fields_for_issue(_nested),
-                _JiraIngestDoc._get_dropdown_custom_fields_for_issue(_nested)
-            ],
-        )
-        _full_filename = str(self.filename)
-        _file_extension = _full_filename.split(".")[-1]
-        _file_without_extension = _full_filename.replace(f".{_file_extension}","")
-        os.makedirs(os.path.dirname(_file_without_extension), exist_ok=True)
-        with open(f"{_file_without_extension}_extra.{_file_extension}", "w", encoding="utf8") as f:
-            f.write(document)
-
-class _JiraSourceConnector(JiraSourceConnector):
-    CUSTOM_FIELDS: list | None = None
-    def __set_custom_fields(self) -> None:
-        _custom_fields = self.jira.get_all_custom_fields()
-        _JiraSourceConnector.CUSTOM_FIELDS = [{"id":item["id"],"name":item["name"]} for item in _custom_fields]
-        self._jira = None # fix serialization
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if not _JiraSourceConnector.CUSTOM_FIELDS:
-            self.__set_custom_fields()
-    def get_ingest_docs(self) -> list[_JiraIngestDoc]:
-        return [_JiraIngestDoc(**item.__dict__) for item in super().get_ingest_docs()]

-
-
-
+    def _get_text_fields_for_issue(self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP) -> str:
+        #no need any more: original data will be included in the custom fields
+        #_origin = super()._get_text_fields_for_issue(issue, c_sep=c_sep, r_sep=r_sep)
+        _custom_fields = self._get_custom_fields_for_issue(issue, c_sep=c_sep, r_sep=r_sep)
+        return f"""Details:
+{r_sep}
+{_custom_fields}"""
 # endregion
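Two details stand out in the Jira rewrite: cloud detection is inferred from the URL ("atlassian.net" means Jira Cloud, where the diff passes the API token as a password next to the user email, while Server/Data Center gets it as a bearer-style token), and on Windows + Cloud the indexer/downloader processes are swapped for the `CustomJira*` overrides in the region above. A sketch of just the credential selection, with plain dicts standing in for `JiraAccessConfig`:

```python
def build_access(url: str, access_token: str) -> dict:
    is_cloud = "atlassian.net" in url  # same heuristic as the diff
    return {"password": access_token} if is_cloud else {"token": access_token}

print(build_access("https://example.atlassian.net", "tkn"))  # {'password': 'tkn'}
print(build_access("https://jira.internal.example", "tkn"))  # {'token': 'tkn'}
```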

ws_bom_robot_app/llm/vector_store/integration/manager.py
CHANGED

@@ -13,6 +13,7 @@ from ws_bom_robot_app.llm.vector_store.integration.sharepoint import Sharepoint
 from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap
 from ws_bom_robot_app.llm.vector_store.integration.slack import Slack
 from ws_bom_robot_app.llm.vector_store.integration.thron import Thron
+from ws_bom_robot_app.llm.vector_store.integration.shopify import Shopify
 class IntegrationManager:
     _list: dict[str, Type[IntegrationStrategy]] = {
         "llmkbazure": Azure,
@@ -28,6 +29,7 @@ class IntegrationManager:
         "llmkbsitemap": Sitemap,
         "llmkbslack": Slack,
         "llmkbthron": Thron,
+        "llmkbshopify": Shopify,
     }
     @classmethod
     def get_strategy(cls, name: str, knowledgebase_path: str, data: dict[str, str]) -> IntegrationStrategy:
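`IntegrationManager` dispatches on a plain name-to-class map, and 0.0.82 registers the new Shopify connector under `llmkbshopify`. A stripped-down stand-in for that dispatch; the `Shopify` class here is a stub, not the package's:

```python
class Shopify:  # stub standing in for the real integration strategy
    def __init__(self, knowledgebase_path: str, data: dict):
        self.knowledgebase_path, self.data = knowledgebase_path, data

_list: dict[str, type] = {"llmkbshopify": Shopify}

def get_strategy(name: str, knowledgebase_path: str, data: dict):
    # unknown names raise KeyError, surfacing misconfigured integrations early
    return _list[name](knowledgebase_path, data)

print(type(get_strategy("llmkbshopify", "/data/kb", {"shop": "example"})).__name__)  # Shopify
```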

ws_bom_robot_app/llm/vector_store/integration/s3.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.
+from unstructured_ingest.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union, Optional

ws_bom_robot_app/llm/vector_store/integration/sftp.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.
+from unstructured_ingest.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union, Optional