ws-bom-robot-app 0.0.80__py3-none-any.whl → 0.0.82__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. ws_bom_robot_app/config.py +10 -0
  2. ws_bom_robot_app/cron_manager.py +6 -6
  3. ws_bom_robot_app/llm/api.py +2 -2
  4. ws_bom_robot_app/llm/providers/llm_manager.py +5 -6
  5. ws_bom_robot_app/llm/utils/cleanup.py +7 -0
  6. ws_bom_robot_app/llm/utils/download.py +0 -2
  7. ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
  8. ws_bom_robot_app/llm/vector_store/integration/base.py +57 -15
  9. ws_bom_robot_app/llm/vector_store/integration/confluence.py +1 -1
  10. ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  11. ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
  12. ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
  13. ws_bom_robot_app/llm/vector_store/integration/googledrive.py +1 -1
  14. ws_bom_robot_app/llm/vector_store/integration/jira.py +93 -60
  15. ws_bom_robot_app/llm/vector_store/integration/manager.py +2 -0
  16. ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
  17. ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
  18. ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
  19. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
  20. ws_bom_robot_app/llm/vector_store/integration/sitemap.py +3 -0
  21. ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
  22. ws_bom_robot_app/llm/vector_store/integration/thron.py +2 -3
  23. ws_bom_robot_app/llm/vector_store/loader/base.py +8 -6
  24. ws_bom_robot_app/llm/vector_store/loader/docling.py +1 -1
  25. ws_bom_robot_app/subprocess_runner.py +103 -0
  26. ws_bom_robot_app/task_manager.py +169 -41
  27. {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/METADATA +18 -8
  28. {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/RECORD +30 -28
  29. {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/WHEEL +0 -0
  30. {ws_bom_robot_app-0.0.80.dist-info → ws_bom_robot_app-0.0.82.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/config.py
@@ -16,9 +16,14 @@ class Settings(BaseSettings):
   robot_data_db_retention_days: float = 60
   robot_data_attachment_folder: str = 'attachment'
   robot_data_attachment_retention_days: float = 1
+  robot_ingest_max_threads: int = 1 # safe choice to 1, avoid potential process-related issues with Docker
   robot_loader_max_threads: int = 1
   robot_task_max_total_parallelism: int = 2 * (os.cpu_count() or 1)
   robot_task_retention_days: float = 1
+  robot_task_strategy: str = 'memory' # memory / db
+  robot_task_mp_enable: bool = True
+  robot_task_mp_method: str = 'spawn' # spawn / fork
+  robot_cron_strategy: str = 'memory' # memory / db
   robot_cms_host: str = ''
   robot_cms_auth: str = ''
   robot_cms_db_folder: str = 'llmVectorDb'
@@ -41,6 +46,7 @@ class Settings(BaseSettings):
   )
   def __init__(self, **kwargs):
     super().__init__(**kwargs)
+    # env
     os.environ["USER_AGENT"] = self.USER_AGENT
     os.environ["OPENAI_API_KEY"] = self.OPENAI_API_KEY
     os.environ["OLLAMA_API_URL"] = self.OLLAMA_API_URL
@@ -53,6 +59,10 @@ class Settings(BaseSettings):
     os.environ["WATSONX_APIKEY"] = self.WATSONX_APIKEY
     os.environ["WATSONX_PROJECTID"] = self.WATSONX_PROJECTID
     os.environ["NEBULY_API_URL"] = self.NEBULY_API_URL
+    # dir
+    os.makedirs(self.robot_data_folder, exist_ok=True)
+    for subfolder in [self.robot_data_db_folder, self.robot_data_attachment_folder, 'db']:
+      os.makedirs(os.path.join(self.robot_data_folder, subfolder), exist_ok=True)

 class RuntimeOptions(BaseModel):
   @staticmethod
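
The new `robot_*` switches are ordinary pydantic-settings fields, so each can be overridden per deployment through an environment variable. A minimal usage sketch, assuming the module-level `config` instance and pydantic-settings' default case-insensitive mapping of field names to env vars:

    import os

    # Persist tasks and cron jobs across restarts instead of keeping them in memory.
    os.environ["ROBOT_TASK_STRATEGY"] = "db"
    os.environ["ROBOT_CRON_STRATEGY"] = "db"
    # 'spawn' is the safer multiprocessing start method inside Docker; 'fork' is the other option.
    os.environ["ROBOT_TASK_MP_METHOD"] = "spawn"

    from ws_bom_robot_app.config import config  # import after the overrides so they take effect
    assert config.robot_task_strategy in ("memory", "db")

Note that `Settings.__init__` now also creates the data root and its subfolders up front, replacing ad-hoc `os.makedirs` calls elsewhere (see the `download.py` hunk below).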
ws_bom_robot_app/cron_manager.py
@@ -1,3 +1,4 @@
+ import os
  from apscheduler.schedulers.background import BackgroundScheduler
  #from apscheduler.schedulers.asyncio import AsyncIOScheduler
  from apscheduler.jobstores.memory import MemoryJobStore
@@ -7,8 +8,7 @@ from apscheduler.triggers.interval import IntervalTrigger
  from apscheduler.triggers.date import DateTrigger
  from fastapi import APIRouter
  from datetime import datetime
- from ws_bom_robot_app.task_manager import task_manager
- from ws_bom_robot_app.llm.utils.cleanup import kb_cleanup_data_file, chat_cleanup_attachment
+ from ws_bom_robot_app.llm.utils.cleanup import kb_cleanup_data_file, chat_cleanup_attachment, task_cleanup_history
  from ws_bom_robot_app.util import _log
  from ws_bom_robot_app.config import config

@@ -22,8 +22,8 @@ class MemoryJobstoreStrategy(JobstoreStrategy):
    return {"default": MemoryJobStore()}

  class PersistentJobstoreStrategy(JobstoreStrategy):
-   def get_jobstore(self, db_url: str = "sqlite:///.data/db/jobs.sqlite"):
-     _log.info(f"Using persistent crob jobstore with database URL: {db_url}.")
+   def get_jobstore(self, db_url: str = f"sqlite:///{config.robot_data_folder}/db/jobs.sqlite"):
+     _log.info(f"Using persistent cron jobstore with database URL: {db_url}.")
      return {"default": SQLAlchemyJobStore(url=db_url)}

  class Job:
@@ -56,12 +56,12 @@ class Job:

  class CronManager:
    _list_default = [
-     Job('cleanup-task',task_manager.cleanup_task, interval=5 * 60),
+     Job('cleanup-task-history',task_cleanup_history, interval=5 * 60),
      Job('cleanup-kb-data',kb_cleanup_data_file, interval=180 * 60),
      Job('cleanup-chat-attachment',chat_cleanup_attachment, interval=120 * 60),
    ]
    def __get_jobstore_strategy(self) -> JobstoreStrategy:
-     if True or config.runtime_options().is_multi_process:
+     if config.robot_cron_strategy == 'memory':
        return MemoryJobstoreStrategy()
      return PersistentJobstoreStrategy()
    def __init__(self, strategy: JobstoreStrategy = None, enable_defaults: bool = True):
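
The jobstore selection, previously short-circuited to memory by `if True or ...`, is now driven by the `robot_cron_strategy` setting. For reference, the memory/db switch maps directly onto APScheduler's two stock jobstores; a standalone sketch of the same selection (illustrative wiring, not the package's exact classes):

    from apscheduler.schedulers.background import BackgroundScheduler
    from apscheduler.jobstores.memory import MemoryJobStore
    from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore

    def make_scheduler(strategy: str, db_url: str = "sqlite:///.data/db/jobs.sqlite") -> BackgroundScheduler:
        # 'memory' forgets jobs on restart; 'db' persists them to SQLite via SQLAlchemy.
        if strategy == "memory":
            jobstores = {"default": MemoryJobStore()}
        else:
            jobstores = {"default": SQLAlchemyJobStore(url=db_url)}
        return BackgroundScheduler(jobstores=jobstores)

    scheduler = make_scheduler("memory")
    scheduler.add_job(lambda: None, "interval", seconds=5 * 60, id="cleanup-task-history")
    scheduler.start()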
ws_bom_robot_app/llm/api.py
@@ -52,7 +52,7 @@ async def _kb(rq: KbRequest) -> VectorDbResponse:

  @router.post("/kb/task")
  async def _kb_task(rq: KbRequest, headers: Annotated[TaskHeader, Header()]) -> IdentifiableEntity:
-   return task_manager.create_task(kb(rq),headers)
+   return task_manager.create_task(lambda: kb(rq),headers)

  @router.post("/rules")
  async def _rules(rq: RulesRequest) -> VectorDbResponse:
@@ -60,7 +60,7 @@ async def _rules(rq: RulesRequest) -> VectorDbResponse:

  @router.post("/rules/task")
  async def _rules_task(rq: RulesRequest, headers: Annotated[TaskHeader, Header()]) -> IdentifiableEntity:
-   return task_manager.create_task(rules(rq),headers)
+   return task_manager.create_task(lambda: rules(rq), headers)

  @router.get("/kb/file/{filename}")
  async def _kb_get_file(filename: str) -> StreamingResponse:
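
Both endpoints now hand `create_task` a zero-argument factory instead of an already-created coroutine. A coroutine object built eagerly with `kb(rq)` is bound to the calling event loop and cannot be pickled, which matters now that task execution in subprocesses is configurable (`robot_task_mp_enable`); a lambda defers coroutine creation until the task actually runs. A minimal sketch of the difference, with illustrative names:

    import asyncio
    from typing import Awaitable, Callable

    async def kb(rq: str) -> str:
        return f"built {rq}"

    def create_task(factory: Callable[[], Awaitable[str]]) -> str:
        # The coroutine is created here, at execution time, in whatever
        # process and event loop the task manager chooses to run it in.
        return asyncio.run(factory())

    print(create_task(lambda: kb("my-kb")))  # -> built my-kb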
ws_bom_robot_app/llm/providers/llm_manager.py
@@ -69,8 +69,7 @@ class Anthropic(LlmInterface):
    model=self.config.model,
    temperature=self.config.temperature,
    max_tokens=8192,
-   streaming=True,
-   stream_usage=True
+   streaming=True
  )

  """
@@ -107,8 +106,9 @@ class OpenAI(LlmInterface):
  chat = ChatOpenAI(
    api_key=self.config.api_key or os.getenv("OPENAI_API_KEY"),
    model=self.config.model,
-   stream_usage=True)
- if not (any(self.config.model.startswith(prefix) for prefix in ["o1", "o3"]) or "search" in self.config.model):
+   streaming=True
+ )
+ if not (any(self.config.model.startswith(prefix) for prefix in ["gpt-5", "o1", "o3"]) or "search" in self.config.model):
    chat.temperature = self.config.temperature
    chat.streaming = True
  return chat
@@ -135,8 +135,7 @@ class DeepSeek(LlmInterface):
  base_url="https://api.deepseek.com",
  max_tokens=8192,
  temperature=self.config.temperature,
- streaming=True,
- stream_usage=True,
+ streaming=True
  )

  def get_models(self):
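
Besides dropping `stream_usage` across providers, the OpenAI branch now also skips setting a custom temperature for the `gpt-5` family, alongside the existing `o1`/`o3` and search-model exclusions, since those model families reject non-default temperatures. The gate, extracted as a standalone sketch:

    # Reasoning-style model families (gpt-5, o1, o3) and search variants
    # reject a custom temperature; it is only applied to the rest.
    def supports_temperature(model: str) -> bool:
        return not (any(model.startswith(p) for p in ("gpt-5", "o1", "o3"))
                    or "search" in model)

    assert supports_temperature("gpt-4o")
    assert not supports_temperature("o3-mini")
    assert not supports_temperature("gpt-4o-search-preview")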
ws_bom_robot_app/llm/utils/cleanup.py
@@ -1,6 +1,7 @@
  import os, logging
  from ws_bom_robot_app.config import config
  from datetime import datetime, timedelta
+ from ws_bom_robot_app.task_manager import task_manager

  def _cleanup_data_file(folders: list[str], retention: float) -> dict:
    """
@@ -72,3 +73,9 @@ def chat_cleanup_attachment() -> dict:
    os.path.join(config.robot_data_folder, config.robot_data_attachment_folder)
  ]
  return _cleanup_data_file(folders, config.robot_data_attachment_retention_days)
+
+ def task_cleanup_history() -> None:
+   """
+   clean up task queue
+   """
+   task_manager.cleanup_task()
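
The `task_cleanup_history` wrapper exists so the scheduler only ever sees plain cleanup callables; `cron_manager` no longer imports `task_manager` directly. The body of `_cleanup_data_file` is not shown in this diff; an illustrative sketch of the retention pattern its signature implies (assumed behavior, not the package's exact code):

    import os, time

    def cleanup_older_than(folders: list[str], retention_days: float) -> dict:
        # Delete files older than `retention_days` under each folder.
        cutoff = time.time() - retention_days * 86400
        removed = []
        for folder in folders:
            if not os.path.isdir(folder):
                continue
            for name in os.listdir(folder):
                path = os.path.join(folder, name)
                if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
                    os.remove(path)
                    removed.append(path)
        return {"removed": removed}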
ws_bom_robot_app/llm/utils/download.py
@@ -84,8 +84,6 @@ async def download_file(url: str, destination: str, chunk_size: int = 8192, auth
    except OSError:
      pass

- # ensuse attachment folder exists
- os.makedirs(os.path.join(config.robot_data_folder, config.robot_data_attachment_folder), exist_ok=True)
  class Base64File(BaseModel):
    """Base64 encoded file representation"""
    url: str
ws_bom_robot_app/llm/vector_store/integration/azure.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/base.py
@@ -1,10 +1,17 @@
- import os
+ import os, copy
+ from random import random
  from langchain_core.documents import Document
  from abc import ABC, abstractmethod
- from unstructured_ingest.v2.interfaces import ProcessorConfig
- from unstructured_ingest.v2.pipeline.pipeline import Pipeline, PartitionerConfig, FiltererConfig
+ from unstructured_ingest.interfaces import ProcessorConfig
+ from unstructured_ingest.pipeline.pipeline import (
+   Pipeline,
+   PartitionerConfig,
+   FiltererConfig
+ )
+ from unstructured_ingest.processes.connector_registry import source_registry
  from typing import Union
  from ws_bom_robot_app.llm.utils.secrets import Secrets
+ from ws_bom_robot_app.config import config

  class IntegrationStrategy(ABC):
    @classmethod
@@ -32,23 +39,58 @@ class IntegrationStrategy(ABC):
    pass

  class UnstructuredIngest():
+   _PIPELINE: Pipeline = None
    def __init__(self, working_directory: str):
      self.working_directory = working_directory
-   def pipeline(self,indexer,downloader,connection,extension: list[str] = None) -> Pipeline:
-     return Pipeline.from_configs(
-       context=ProcessorConfig(
+   def pipeline(self,indexer_config,downloader_config,connection_config,extension: list[str] = None) -> Pipeline:
+     def _default_processor_config() -> ProcessorConfig:
+       return ProcessorConfig(
          reprocess=False,
          verbose=False,
          tqdm=False,
-         num_processes=2,
+         num_processes=config.robot_ingest_max_threads, #safe choice to 1, avoid potential process-related issues with Docker
+         disable_parallelism=False,
          preserve_downloads=True,
          download_only=True,
-         raise_on_error=False
-       ),
-       indexer_config=indexer,
-       downloader_config=downloader,
-       source_connection_config=connection,
-       partitioner_config=PartitionerConfig(),
-       filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
-     )
+         raise_on_error=False,
+         iter_delete=True,
+         delete_cache=False #already managed by the generator task
+       )
+     def _init_pipeline() -> Pipeline:
+       return Pipeline.from_configs(
+         context=_default_processor_config(),
+         indexer_config=indexer_config,
+         downloader_config=downloader_config,
+         source_connection_config=connection_config,
+         partitioner_config=PartitionerConfig(),
+         filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
+       )
+     def _instance_pipeline() -> Pipeline:
+       from unstructured_ingest.pipeline.steps.index import IndexStep
+       from unstructured_ingest.pipeline.steps.download import DownloadStep
+       from unstructured_ingest.pipeline.steps.filter import Filterer, FilterStep
+       _context = _default_processor_config()
+       source_entry = {
+         k: v
+         for k, v in source_registry.items()
+         if type(indexer_config) is v.indexer_config
+         and type(downloader_config) is v.downloader_config
+         and type(connection_config) is v.connection_config
+       }
+       source = list(source_entry.values())[0]
+       _pipeline = copy.deepcopy(UnstructuredIngest._PIPELINE)
+       _pipeline.context = _context
+       _pipeline.context.work_dir = f"{self.working_directory}_unstructured" # use sibling directory, cleaned up by the generator task
+       _pipeline.indexer_step = IndexStep(process=source.indexer(index_config=indexer_config, connection_config=connection_config), context=_context)
+       _pipeline.downloader_step = DownloadStep(process=source.downloader(download_config=downloader_config, connection_config=connection_config), context=_context)
+       _pipeline.filter_step = FilterStep(process=Filterer(config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)), context=_context) if extension else None
+       return _pipeline

+     if not UnstructuredIngest._PIPELINE:
+       import random
+       import time
+       time.sleep(random.uniform(0.2, 1))
+       if not UnstructuredIngest._PIPELINE:
+         UnstructuredIngest._PIPELINE = _init_pipeline()
+
+     return _instance_pipeline()
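
The class now builds one template `Pipeline` (the expensive registry wiring in `_init_pipeline`) and hands each caller a deep copy with per-call configs; the randomized `time.sleep` plus re-check is a lightweight, best-effort guard against two threads initializing the template at once. The same template-and-clone idea is sketched below with an explicit lock instead of the sleep, using a plain dict as a stand-in for the real `Pipeline` object:

    import copy
    import threading

    class PipelineCache:
        _template: dict | None = None
        _lock = threading.Lock()

        @classmethod
        def get(cls, work_dir: str) -> dict:
            if cls._template is None:
                with cls._lock:
                    if cls._template is None:  # double-checked initialization
                        cls._template = {"steps": ["index", "download", "filter"]}
            instance = copy.deepcopy(cls._template)  # per-call clone, never the shared template
            instance["work_dir"] = f"{work_dir}_unstructured"
            return instance

    p1, p2 = PipelineCache.get("/tmp/kb1"), PipelineCache.get("/tmp/kb2")
    assert p1 is not p2 and p1["steps"] == p2["steps"]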
ws_bom_robot_app/llm/vector_store/integration/confluence.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
+ from unstructured_ingest.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Optional, Union
ws_bom_robot_app/llm/vector_store/integration/dropbox.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union
ws_bom_robot_app/llm/vector_store/integration/gcs.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
+ from unstructured_ingest.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/github.py
@@ -1,10 +1,12 @@
  import asyncio
  from typing import Optional, Union
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
- from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
- from unstructured_ingest.connector.git import GitAccessConfig
- from unstructured_ingest.connector.github import SimpleGitHubConfig
- from unstructured_ingest.runner import GithubRunner
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+ from unstructured_ingest.processes.connectors.github import (
+   GithubIndexerConfig,
+   GithubDownloaderConfig,
+   GithubConnectionConfig,
+   GithubAccessConfig
+ )
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from pydantic import BaseModel, Field, AliasChoices
@@ -27,28 +29,26 @@ class Github(IntegrationStrategy):
    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
      super().__init__(knowledgebase_path, data)
      self.__data = GithubParams.model_validate(self.data)
+     self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
    def working_subdirectory(self) -> str:
      return 'github'
    def run(self) -> None:
-     access_config = GitAccessConfig(
-       access_token=self.__data.access_token
-     )
-     file_ext = self.__data.file_ext or None
-     file_glob = [f"**/*{ext}" for ext in file_ext] if file_ext else None
-     config = SimpleGitHubConfig(
-       url = self.__data.repo,
-       access_config=access_config,
+     indexer_config = GithubIndexerConfig(
        branch=self.__data.branch,
-       file_glob=file_glob
+       recursive=True
+     )
+     downloader_config = GithubDownloaderConfig(
+       download_dir=self.working_directory
+     )
+     connection_config = GithubConnectionConfig(
+       access_config=GithubAccessConfig(access_token=self.__data.access_token),
+       url=self.__data.repo
      )
-     runner = GithubRunner(
-       connector_config=config,
-       processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
-       read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
-       partition_config=None,
-       retry_strategy_config=None
-     )
-     runner.run()
+     self.__unstructured_ingest.pipeline(
+       indexer_config,
+       downloader_config,
+       connection_config,
+       extension=self.__data.file_ext).run()
    async def load(self) -> list[Document]:
      await asyncio.to_thread(self.run)
      await asyncio.sleep(1)
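
One behavioral detail of this v1-to-v2 migration: the old connector took a precomputed `file_glob`, while the new path passes raw extensions and lets `UnstructuredIngest.pipeline()` derive the glob list centrally. The derivation, extracted as a standalone sketch:

    def to_file_glob(extensions: list[str] | None) -> list[str] | None:
        # None or an empty list disables extension filtering entirely.
        return [f"**/*{ext}" for ext in extensions] if extensions else None

    assert to_file_glob([".md", ".py"]) == ["**/*.md", "**/*.py"]
    assert to_file_glob(None) is None
    assert to_file_glob([]) is None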
ws_bom_robot_app/llm/vector_store/integration/googledrive.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
+ from unstructured_ingest.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union
ws_bom_robot_app/llm/vector_store/integration/jira.py
@@ -1,21 +1,39 @@
+ import logging
  import asyncio, os
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
+ import sys
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from pydantic import BaseModel, Field, AliasChoices
- from typing import Any, Optional, Union
- from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
- from unstructured_ingest.connector.jira import SimpleJiraConfig, JiraAccessConfig, JiraSourceConnector, JiraIngestDoc, nested_object_to_field_getter, _get_id_fields_for_issue, _get_project_fields_for_issue
- from unstructured_ingest.runner import JiraRunner
-
+ from typing import Any, Generator, Iterable, Optional, Union
+ from unstructured_ingest.pipeline.pipeline import Pipeline
+ from unstructured_ingest.processes.connectors.jira import (
+   JiraIndexerConfig,
+   JiraIndexer,
+   JiraIssueMetadata,
+   api_page_based_generator,
+   JiraDownloaderConfig,
+   JiraDownloader,
+   DEFAULT_C_SEP,
+   DEFAULT_R_SEP,
+   JiraConnectionConfig,
+   JiraAccessConfig
+ )
+ from unstructured_ingest.pipeline.pipeline import (
+   Pipeline,
+   PartitionerConfig,
+   FiltererConfig
+ )
+ from unstructured_ingest.interfaces import ProcessorConfig

  class JiraParams(BaseModel):
    """
    JiraParams is a Pydantic model that represents the parameters required to interact with a Jira instance.
+   Docs: https://docs.unstructured.io/open-source/ingestion/source-connectors/jira#jira

    Attributes:
      url (str): The URL of the Jira instance, e.g., 'https://example.atlassian.net'.
-     access_token (str): The access token for authenticating with the Jira API.
+     access_token (str): The access token for authenticating with the Jira API: https://id.atlassian.com/manage-profile/security/api-tokens
      user_email (str): The email address of the Jira user.
      projects (list[str]): A list of project keys or IDs to interact with, e.g., ['SCRUM', 'PROJ1'].
      boards (Optional[list[str]]): An optional list of board IDs to interact with. Defaults to None, e.g., ['1', '2'].
@@ -27,34 +45,45 @@ class JiraParams(BaseModel):
    projects: list[str]
    boards: Optional[list[str]] | None = None
    issues: Optional[list[str]] | None = None
+   status_filters: Optional[list[str]] | None = None

  class Jira(IntegrationStrategy):
    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
      super().__init__(knowledgebase_path, data)
      self.__data = JiraParams.model_validate(self.data)
+     self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
    def working_subdirectory(self) -> str:
      return 'jira'
    def run(self) -> None:
-     access_config = JiraAccessConfig(
-       api_token=self.__data.access_token
-     )
-     config = SimpleJiraConfig(
-       user_email=self.__data.user_email,
-       url = self.__data.url,
-       access_config=access_config,
+     indexer_config = JiraIndexerConfig(
        projects=self.__data.projects,
        boards=self.__data.boards,
-       issues=self.__data.issues
-     )
-     # runner override: waiting for v2 migration https://github.com/Unstructured-IO/unstructured-ingest/issues/106
-     runner = _JiraRunner(
-       connector_config=config,
-       processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
-       read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
-       partition_config=None,
-       retry_strategy_config=None
+       issues=self.__data.issues,
+       status_filters=self.__data.status_filters
      )
-     runner.run()
+     downloader_config = JiraDownloaderConfig(
+       download_dir=self.working_directory,
+       download_attachments=False
+     )
+     _is_cloud = "atlassian.net" in self.__data.url
+     _access_config = JiraAccessConfig(token=self.__data.access_token) \
+       if not _is_cloud \
+       else JiraAccessConfig(password=self.__data.access_token)
+     connection_config = JiraConnectionConfig(
+       access_config=_access_config,
+       username=self.__data.user_email,
+       url=self.__data.url,
+       cloud=_is_cloud
+     )
+     pipeline: Pipeline = self.__unstructured_ingest.pipeline(
+       indexer_config,
+       downloader_config,
+       connection_config,
+       extension=None)
+     if _is_cloud and sys.platform == "win32":
+       pipeline.indexer_step.process = CustomJiraIndexer(**vars(pipeline.indexer_step.process))
+       pipeline.downloader_step.process = CustomJiraDownloader(**vars(pipeline.downloader_step.process))
+     pipeline.run()
    async def load(self) -> list[Document]:
      await asyncio.to_thread(self.run)
      await asyncio.sleep(1)
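
The connection setup now branches on hosting type: Atlassian Cloud (detected by `atlassian.net` in the URL) authenticates with email plus API token, passed as `password`, while self-hosted Jira uses a personal access token, passed as `token`. The split, as a standalone sketch:

    def make_access_kwargs(url: str, secret: str) -> dict:
        # Cloud: email + API token (password field); Server/DC: PAT (token field).
        is_cloud = "atlassian.net" in url
        return {"password": secret} if is_cloud else {"token": secret}

    assert make_access_kwargs("https://acme.atlassian.net", "s") == {"password": "s"}
    assert make_access_kwargs("https://jira.acme.internal", "s") == {"token": "s"}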
@@ -62,8 +91,38 @@ class Jira(IntegrationStrategy):


  # region override
- class _JiraIngestDoc(JiraIngestDoc):
-   def _get_dropdown_custom_fields_for_issue(issue: dict, c_sep=" " * 5, r_sep="\n") -> str:
+ class CustomJiraIndexer(JiraIndexer):
+   """
+   fix default run_jql for cloud: missing enhanced_jql
+   """
+   import sys
+   def __init__(self, **kwargs):
+     for key, value in kwargs.items():
+       try:
+         setattr(super(), key, value)
+       except AttributeError:
+         setattr(self, key, value)
+   def run_jql(self, jql: str, **kwargs) -> Generator[JiraIssueMetadata, None, None]:
+     with self.connection_config.get_client() as client:
+       for issue in api_page_based_generator(client.jql, jql=jql, **kwargs):
+         yield JiraIssueMetadata.model_validate(issue)
+
+ class CustomJiraDownloader(JiraDownloader):
+   CUSTOM_FIELDS: list | None = None
+   def _set_custom_fields(self) -> list:
+     with self.connection_config.get_client() as client:
+       _custom_fields = client.get_all_custom_fields()
+     return [{"id": item["id"], "name": item["name"]} for item in _custom_fields]
+   def __init__(self, **kwargs):
+     for key, value in kwargs.items():
+       try:
+         setattr(super(), key, value)
+       except AttributeError:
+         setattr(self, key, value)
+     if not self.CUSTOM_FIELDS:
+       self.CUSTOM_FIELDS = self._set_custom_fields()
+
+   def _get_custom_fields_for_issue(self, issue: dict, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP) -> str:
      def _parse_value(value: Any) -> Any:
        if isinstance(value, dict):
          _candidate = ["displayName", "name", "value"]
@@ -74,45 +133,19 @@ class _JiraIngestDoc(JiraIngestDoc):
      def _remap_custom_fields(fields: dict):
        remapped_fields = {}
        for field_key, field_value in fields.items():
-         new_key = next((map_item["name"] for map_item in _JiraSourceConnector.CUSTOM_FIELDS if field_key == map_item["id"]), field_key)
+         new_key = next((map_item["name"] for map_item in self.CUSTOM_FIELDS if field_key == map_item["id"]), field_key)
          if new_key != field_value:
            remapped_fields[new_key] = field_value
        return remapped_fields
      filtered_fields = {key: _parse_value(value) for key, value in issue.items() if value is not None and type(value) not in [list]}
      custom_fields =_remap_custom_fields(filtered_fields)
      return (r_sep + c_sep ).join([f"{key}: {value}{r_sep}" for key, value in custom_fields.items()])
-   def __init__(self, *args, **kwargs):
-     super().__init__(*args, **kwargs)
-     _issue = self.issue
-     _nested: dict = nested_object_to_field_getter(_issue["fields"])
-     document = "\n\n\n".join(
-       [
-         _get_id_fields_for_issue(_issue),
-         _get_project_fields_for_issue(_nested),
-         _JiraIngestDoc._get_dropdown_custom_fields_for_issue(_nested)
-       ],
-     )
-     _full_filename = str(self.filename)
-     _file_extension = _full_filename.split(".")[-1]
-     _file_without_extension = _full_filename.replace(f".{_file_extension}","")
-     os.makedirs(os.path.dirname(_file_without_extension), exist_ok=True)
-     with open(f"{_file_without_extension}_extra.{_file_extension}", "w", encoding="utf8") as f:
-       f.write(document)
-
- class _JiraSourceConnector(JiraSourceConnector):
-   CUSTOM_FIELDS: list | None = None
-   def __set_custom_fields(self) -> None:
-     _custom_fields = self.jira.get_all_custom_fields()
-     _JiraSourceConnector.CUSTOM_FIELDS = [{"id":item["id"],"name":item["name"]} for item in _custom_fields]
-     self._jira = None # fix serialization
-   def __init__(self, *args, **kwargs):
-     super().__init__(*args, **kwargs)
-     if not _JiraSourceConnector.CUSTOM_FIELDS:
-       self.__set_custom_fields()
-   def get_ingest_docs(self) -> list[_JiraIngestDoc]:
-     return [_JiraIngestDoc(**item.__dict__) for item in super().get_ingest_docs()]

- class _JiraRunner(JiraRunner):
-   def get_source_connector_cls(self):
-     return _JiraSourceConnector
+   def _get_text_fields_for_issue(self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP) -> str:
+     #no need any more: original data will be included in the custom fields
+     #_origin = super()._get_text_fields_for_issue(issue, c_sep=c_sep, r_sep=r_sep)
+     _custom_fields = self._get_custom_fields_for_issue(issue, c_sep=c_sep, r_sep=r_sep)
+     return f"""Details:
+       {r_sep}
+       {_custom_fields}"""
  # endregion
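
The remapping that both versions share replaces Jira's opaque custom-field ids with their human-readable names, using the id-to-name map cached on the downloader. A standalone sketch of that step:

    # Jira returns keys like 'customfield_10010'; the cached map rewrites
    # them into readable labels before the issue is serialized to text.
    CUSTOM_FIELDS = [{"id": "customfield_10010", "name": "Sprint"}]

    def remap_custom_fields(fields: dict) -> dict:
        remapped = {}
        for key, value in fields.items():
            new_key = next((m["name"] for m in CUSTOM_FIELDS if key == m["id"]), key)
            remapped[new_key] = value
        return remapped

    print(remap_custom_fields({"customfield_10010": "Sprint 42", "summary": "Fix bug"}))
    # {'Sprint': 'Sprint 42', 'summary': 'Fix bug'}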
ws_bom_robot_app/llm/vector_store/integration/manager.py
@@ -13,6 +13,7 @@ from ws_bom_robot_app.llm.vector_store.integration.sharepoint import Sharepoint
  from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap
  from ws_bom_robot_app.llm.vector_store.integration.slack import Slack
  from ws_bom_robot_app.llm.vector_store.integration.thron import Thron
+ from ws_bom_robot_app.llm.vector_store.integration.shopify import Shopify
  class IntegrationManager:
    _list: dict[str, Type[IntegrationStrategy]] = {
      "llmkbazure": Azure,
@@ -28,6 +29,7 @@ class IntegrationManager:
      "llmkbsitemap": Sitemap,
      "llmkbslack": Slack,
      "llmkbthron": Thron,
+     "llmkbshopify": Shopify,
    }
    @classmethod
    def get_strategy(cls, name: str, knowledgebase_path: str, data: dict[str, str]) -> IntegrationStrategy:
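
Registering the new Shopify connector (shipped in this release as `shopify.py`, +143 lines) is just one more entry in the strategy registry. A simplified sketch of the lookup pattern, with stub classes standing in for the real strategies:

    from typing import Type

    class IntegrationStrategy: ...
    class Shopify(IntegrationStrategy): ...

    _registry: dict[str, Type[IntegrationStrategy]] = {"llmkbshopify": Shopify}

    def get_strategy(name: str) -> IntegrationStrategy:
        # Keys are the CMS integration identifiers; values are strategy classes.
        try:
            return _registry[name]()
        except KeyError:
            raise ValueError(f"Unknown integration: {name}") from None

    assert isinstance(get_strategy("llmkbshopify"), Shopify)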
ws_bom_robot_app/llm/vector_store/integration/s3.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/sftp.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional