ws-bom-robot-app 0.0.37__py3-none-any.whl → 0.0.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. ws_bom_robot_app/config.py +35 -7
  2. ws_bom_robot_app/cron_manager.py +15 -14
  3. ws_bom_robot_app/llm/agent_context.py +26 -0
  4. ws_bom_robot_app/llm/agent_description.py +123 -123
  5. ws_bom_robot_app/llm/agent_handler.py +176 -180
  6. ws_bom_robot_app/llm/agent_lcel.py +107 -54
  7. ws_bom_robot_app/llm/api.py +100 -7
  8. ws_bom_robot_app/llm/defaut_prompt.py +15 -15
  9. ws_bom_robot_app/llm/evaluator.py +319 -0
  10. ws_bom_robot_app/llm/feedbacks/__init__.py +0 -0
  11. ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -0
  12. ws_bom_robot_app/llm/main.py +159 -110
  13. ws_bom_robot_app/llm/models/api.py +70 -5
  14. ws_bom_robot_app/llm/models/feedback.py +30 -0
  15. ws_bom_robot_app/llm/nebuly_handler.py +185 -0
  16. ws_bom_robot_app/llm/providers/llm_manager.py +244 -80
  17. ws_bom_robot_app/llm/tools/models/main.py +8 -0
  18. ws_bom_robot_app/llm/tools/tool_builder.py +68 -23
  19. ws_bom_robot_app/llm/tools/tool_manager.py +343 -133
  20. ws_bom_robot_app/llm/tools/utils.py +41 -25
  21. ws_bom_robot_app/llm/utils/agent.py +34 -0
  22. ws_bom_robot_app/llm/utils/chunker.py +6 -1
  23. ws_bom_robot_app/llm/utils/cleanup.py +81 -0
  24. ws_bom_robot_app/llm/utils/cms.py +123 -0
  25. ws_bom_robot_app/llm/utils/download.py +183 -79
  26. ws_bom_robot_app/llm/utils/print.py +29 -29
  27. ws_bom_robot_app/llm/vector_store/db/__init__.py +0 -0
  28. ws_bom_robot_app/llm/vector_store/db/base.py +193 -0
  29. ws_bom_robot_app/llm/vector_store/db/chroma.py +97 -0
  30. ws_bom_robot_app/llm/vector_store/db/faiss.py +91 -0
  31. ws_bom_robot_app/llm/vector_store/db/manager.py +15 -0
  32. ws_bom_robot_app/llm/vector_store/db/qdrant.py +73 -0
  33. ws_bom_robot_app/llm/vector_store/generator.py +137 -137
  34. ws_bom_robot_app/llm/vector_store/integration/api.py +216 -0
  35. ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
  36. ws_bom_robot_app/llm/vector_store/integration/base.py +58 -15
  37. ws_bom_robot_app/llm/vector_store/integration/confluence.py +41 -11
  38. ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  39. ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
  40. ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
  41. ws_bom_robot_app/llm/vector_store/integration/googledrive.py +46 -17
  42. ws_bom_robot_app/llm/vector_store/integration/jira.py +112 -75
  43. ws_bom_robot_app/llm/vector_store/integration/manager.py +6 -2
  44. ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
  45. ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
  46. ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
  47. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
  48. ws_bom_robot_app/llm/vector_store/integration/sitemap.py +9 -1
  49. ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
  50. ws_bom_robot_app/llm/vector_store/integration/thron.py +236 -0
  51. ws_bom_robot_app/llm/vector_store/loader/base.py +52 -8
  52. ws_bom_robot_app/llm/vector_store/loader/docling.py +71 -33
  53. ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
  54. ws_bom_robot_app/main.py +148 -146
  55. ws_bom_robot_app/subprocess_runner.py +106 -0
  56. ws_bom_robot_app/task_manager.py +207 -54
  57. ws_bom_robot_app/util.py +65 -20
  58. ws_bom_robot_app-0.0.103.dist-info/METADATA +364 -0
  59. ws_bom_robot_app-0.0.103.dist-info/RECORD +76 -0
  60. {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/WHEEL +1 -1
  61. ws_bom_robot_app/llm/settings.py +0 -4
  62. ws_bom_robot_app/llm/utils/agent_utils.py +0 -17
  63. ws_bom_robot_app/llm/utils/kb.py +0 -34
  64. ws_bom_robot_app-0.0.37.dist-info/METADATA +0 -277
  65. ws_bom_robot_app-0.0.37.dist-info/RECORD +0 -60
  66. {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/integration/confluence.py
@@ -1,9 +1,10 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
+ from unstructured_ingest.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceIndexer, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
+ from unstructured_ingest.pipeline.pipeline import Pipeline
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
- from typing import Union
+ from typing import List, Optional, Union
  from pydantic import BaseModel, Field, AliasChoices

  class ConfluenceParams(BaseModel):
@@ -12,15 +13,19 @@ class ConfluenceParams(BaseModel):

      Attributes:
          url (str): The URL of the Confluence instance, e.g., 'https://example.atlassian.net'.
-         access_token (str): The access token for authenticating with Confluence, e.g., 'AT....'
-         user_email (str): The email address of the Confluence user
+         username (str): The email address or username of the Confluence user
+         password: Confluence password or Cloud API token, if filled, set the access_token to None and vice versa.
+         access_token (str): The personal access token for authenticating with Confluence, e.g., 'AT....'
          spaces (list[str]): A list of Confluence spaces to interact with, e.g., ['SPACE1', 'SPACE2'].
+         max_num_of_docs_from_each_space (int): The maximum number of documents to fetch from each space. Defaults to 500, with a maximum limit of 5000.
          extension (list[str], optional): A list of file extensions to filter by. Defaults to None, e.g., ['.pdf', '.docx'].
      """
      url: str
-     access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
-     user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"))
+     username: str = Field(validation_alias=AliasChoices("userName","userEmail","username"))
+     password: Optional[str] = None
+     access_token: Optional[str] = Field(None, validation_alias=AliasChoices("accessToken","access_token"))
      spaces: list[str] = []
+     max_num_of_docs_from_each_space: int = Field(default=500, ge=1, le=5000, validation_alias=AliasChoices("maxNumOfDocsFromEachSpace","max_num_of_docs_from_each_space"))
      extension: list[str] = Field(default=None)
  class Confluence(IntegrationStrategy):
      def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
@@ -31,23 +36,48 @@ class Confluence(IntegrationStrategy):
          return 'confluence'
      def run(self) -> None:
          indexer_config = ConfluenceIndexerConfig(
-             spaces=self.__data.spaces
+             spaces=self.__data.spaces,
+             max_num_of_docs_from_each_space=self.__data.max_num_of_docs_from_each_space
          )
          downloader_config = ConfluenceDownloaderConfig(
              download_dir=self.working_directory
          )
          connection_config = ConfluenceConnectionConfig(
-             access_config=ConfluenceAccessConfig(api_token=self.__data.access_token),
+             access_config=ConfluenceAccessConfig(password=self.__data.password, token=self.__data.access_token),
              url=self.__data.url,
-             user_email=self.__data.user_email
+             username=self.__data.username
          )
-         self.__unstructured_ingest.pipeline(
+         pipeline: Pipeline = self.__unstructured_ingest.pipeline(
              indexer_config,
              downloader_config,
              connection_config,
-             extension=self.__data.extension).run()
+             extension=self.__data.extension
+         )
+         pipeline.indexer_step.process = CustomConfluenceIndexer(**vars(pipeline.indexer_step.process))
+         pipeline.run()
      async def load(self) -> list[Document]:
          await asyncio.to_thread(self.run)
          await asyncio.sleep(1)
          return await Loader(self.working_directory).load()

+ class CustomConfluenceIndexer(ConfluenceIndexer):
+     def __init__(self, **kwargs):
+         for key, value in kwargs.items():
+             try:
+                 setattr(super(), key, value)
+             except AttributeError:
+                 setattr(self, key, value)
+     def _get_docs_ids_within_one_space(self, space_key: str) -> List[dict]:
+         with self.connection_config.get_client() as client:
+             pages = client.get_all_pages_from_space(
+                 space=space_key,
+                 start=0,
+                 limit=self.index_config.max_num_of_docs_from_each_space, #explicitly limit the number of pages fetched (omitted in unstructured-ingest)
+                 expand=None,
+                 content_type="page", # blogpost and comment types not currently supported
+                 status=None,
+             )
+         limited_pages = pages[: self.index_config.max_num_of_docs_from_each_space]
+         doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]
+         return doc_ids
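The reworked ConfluenceParams accepts both CMS-style camelCase keys and snake_case keys via pydantic's AliasChoices (the `userEmail` alias presumably keeps old 0.0.37-era payloads validating), and clamps the per-space document count to 1-5000. A minimal sketch of that behavior, re-declaring the model with plain pydantic v2 rather than importing it from the package:

# Sketch only: the model is re-declared here to show the alias and bounds
# behavior; the field set mirrors the diff above.
from typing import Optional
from pydantic import AliasChoices, BaseModel, Field

class ConfluenceParamsSketch(BaseModel):
    url: str
    username: str = Field(validation_alias=AliasChoices("userName", "userEmail", "username"))
    password: Optional[str] = None
    access_token: Optional[str] = Field(None, validation_alias=AliasChoices("accessToken", "access_token"))
    spaces: list[str] = []
    max_num_of_docs_from_each_space: int = Field(
        default=500, ge=1, le=5000,
        validation_alias=AliasChoices("maxNumOfDocsFromEachSpace", "max_num_of_docs_from_each_space"),
    )

# camelCase payloads validate against the same model as snake_case ones
params = ConfluenceParamsSketch.model_validate({
    "url": "https://example.atlassian.net",
    "userEmail": "user@example.com",      # old-style key, resolved via the alias chain
    "accessToken": "AT....",
    "spaces": ["SPACE1"],
    "maxNumOfDocsFromEachSpace": 1000,
})
assert params.username == "user@example.com"
assert params.max_num_of_docs_from_each_space == 1000  # values above 5000 would fail validation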
ws_bom_robot_app/llm/vector_store/integration/dropbox.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union
ws_bom_robot_app/llm/vector_store/integration/gcs.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
+ from unstructured_ingest.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/github.py
@@ -1,10 +1,12 @@
  import asyncio
  from typing import Optional, Union
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
- from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
- from unstructured_ingest.connector.git import GitAccessConfig
- from unstructured_ingest.connector.github import SimpleGitHubConfig
- from unstructured_ingest.runner import GithubRunner
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+ from unstructured_ingest.processes.connectors.github import (
+     GithubIndexerConfig,
+     GithubDownloaderConfig,
+     GithubConnectionConfig,
+     GithubAccessConfig
+ )
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from pydantic import BaseModel, Field, AliasChoices
@@ -27,28 +29,26 @@ class Github(IntegrationStrategy):
      def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
          super().__init__(knowledgebase_path, data)
          self.__data = GithubParams.model_validate(self.data)
+         self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
      def working_subdirectory(self) -> str:
          return 'github'
      def run(self) -> None:
-         access_config = GitAccessConfig(
-             access_token=self.__data.access_token
-         )
-         file_ext = self.__data.file_ext or None
-         file_glob = [f"**/*{ext}" for ext in file_ext] if file_ext else None
-         config = SimpleGitHubConfig(
-             url = self.__data.repo,
-             access_config=access_config,
+         indexer_config = GithubIndexerConfig(
              branch=self.__data.branch,
-             file_glob=file_glob
+             recursive=True
+         )
+         downloader_config = GithubDownloaderConfig(
+             download_dir=self.working_directory
+         )
+         connection_config = GithubConnectionConfig(
+             access_config=GithubAccessConfig(access_token=self.__data.access_token),
+             url=self.__data.repo
          )
-         runner = GithubRunner(
-             connector_config=config,
-             processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
-             read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
-             partition_config=None,
-             retry_strategy_config=None
-         )
-         runner.run()
+         self.__unstructured_ingest.pipeline(
+             indexer_config,
+             downloader_config,
+             connection_config,
+             extension=self.__data.file_ext).run()
      async def load(self) -> list[Document]:
          await asyncio.to_thread(self.run)
          await asyncio.sleep(1)
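GitHub ingestion moves from the removed GithubRunner/SimpleGitHubConfig API to the same indexer/downloader/connection trio the other connectors pass to UnstructuredIngest.pipeline(...). A rough sketch of that shared call shape with stand-in dataclasses (the names and fields below are illustrative, not the unstructured-ingest API, which needs live credentials):

# Stand-in sketch of the config trio every migrated connector now follows.
from dataclasses import dataclass

@dataclass
class IndexerConfig:        # what to enumerate at the source (branch, spaces, projects, ...)
    branch: str
    recursive: bool = True

@dataclass
class DownloaderConfig:     # where raw files land before the Loader picks them up
    download_dir: str

@dataclass
class ConnectionConfig:     # how to reach and authenticate against the source
    url: str
    access_token: str | None = None

def pipeline(indexer: IndexerConfig, downloader: DownloaderConfig,
             connection: ConnectionConfig, extension: list[str] | None = None) -> None:
    # the real UnstructuredIngest.pipeline(...) wires these into index -> download
    # steps and returns a runnable Pipeline; here we only show the call shape
    print(f"index {connection.url}@{indexer.branch} -> {downloader.download_dir}, ext={extension}")

pipeline(
    IndexerConfig(branch="main"),
    DownloaderConfig(download_dir="/tmp/kb/github"),
    ConnectionConfig(url="https://github.com/owner/repo"),
    extension=[".md", ".py"],
)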
ws_bom_robot_app/llm/vector_store/integration/googledrive.py
@@ -1,10 +1,38 @@
  import asyncio
+ import json
+ from pathlib import Path
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
+ from unstructured_ingest.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
+ from unstructured_ingest.data_types.file_data import FileData as OriginalFileData, BatchFileData as OriginalBatchFileData
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union
  from pydantic import BaseModel, Field, AliasChoices
+
+ # UTF-8 safe FileData classes
+ class FileData(OriginalFileData):
+     @classmethod
+     def from_file(cls, path: str):
+         path = Path(path).resolve()
+         if not path.exists() or not path.is_file():
+             raise ValueError(f"file path not valid: {path}")
+         for encoding in ['utf-8', 'cp1252', 'iso-8859-1', 'latin-1']:
+             try:
+                 with open(str(path), "r", encoding=encoding) as f:
+                     return cls.model_validate(json.load(f))
+             except (UnicodeDecodeError, UnicodeError):
+                 continue
+         raise ValueError(f"Could not decode file {path} with any supported encoding")
+
+     def to_file(self, path: str) -> None:
+         path = Path(path).resolve()
+         path.parent.mkdir(parents=True, exist_ok=True)
+         with open(str(path), "w", encoding="utf-8") as f:
+             json.dump(self.model_dump(), f, indent=2, ensure_ascii=False)
+
+ class BatchFileData(OriginalBatchFileData, FileData):
+     pass
+
  class GoogleDriveParams(BaseModel):
      """
      GoogleDriveParams is a model that holds parameters for Google Drive integration.
@@ -42,26 +70,27 @@ class GoogleDrive(IntegrationStrategy):
          super().__init__(knowledgebase_path, data)
          self.__data = GoogleDriveParams.model_validate(self.data)
          self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
+         self._apply_encoding_fix()
+
+     def _apply_encoding_fix(self):
+         """Replace FileData classes with UTF-8 safe versions"""
+         import unstructured_ingest.data_types.file_data as fd
+         fd.FileData = FileData
+         fd.BatchFileData = BatchFileData
+         fd.file_data_from_file = lambda path: BatchFileData.from_file(path) if path else FileData.from_file(path)
+
      def working_subdirectory(self) -> str:
          return 'googledrive'
+
      def run(self) -> None:
-         indexer_config = GoogleDriveIndexerConfig(
-             extensions=self.__data.extensions,
-             recursive=self.__data.recursive
-         )
-         downloader_config = GoogleDriveDownloaderConfig(
-             download_dir=self.working_directory
-         )
-         connection_config = GoogleDriveConnectionConfig(
-             access_config=GoogleDriveAccessConfig(
-                 service_account_key=self.__data.service_account_key
-             ),
-             drive_id=self.__data.drive_id
-         )
          self.__unstructured_ingest.pipeline(
-             indexer_config,
-             downloader_config,
-             connection_config).run()
+             GoogleDriveIndexerConfig(extensions=self.__data.extensions, recursive=self.__data.recursive),
+             GoogleDriveDownloaderConfig(download_dir=self.working_directory),
+             GoogleDriveConnectionConfig(
+                 access_config=GoogleDriveAccessConfig(service_account_key=self.__data.service_account_key),
+                 drive_id=self.__data.drive_id
+             )
+         ).run()
      async def load(self) -> list[Document]:
          await asyncio.to_thread(self.run)
          await asyncio.sleep(1)
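The encoding fix monkey-patches unstructured_ingest.data_types.file_data so FileData round-trips tolerate non-UTF-8 metadata files. The core retry pattern, shown standalone (pure stdlib; note that latin-1 accepts any byte sequence, so the final fallback always decodes and any remaining errors surface from json.load instead):

# Pure-stdlib sketch of the fallback-decoding pattern used by the patched
# FileData.from_file above.
import json
from pathlib import Path

def load_json_any_encoding(path: str) -> dict:
    p = Path(path).resolve()
    if not p.is_file():
        raise ValueError(f"file path not valid: {p}")
    for encoding in ("utf-8", "cp1252", "iso-8859-1", "latin-1"):
        try:
            with open(p, "r", encoding=encoding) as f:
                return json.load(f)   # parsed with the first encoding that decodes cleanly
        except (UnicodeDecodeError, UnicodeError):
            continue                  # try the next candidate encoding
    raise ValueError(f"Could not decode file {p} with any supported encoding")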
ws_bom_robot_app/llm/vector_store/integration/jira.py
@@ -1,114 +1,151 @@
- import asyncio
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
- from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
- from unstructured_ingest.connector.jira import SimpleJiraConfig, JiraAccessConfig
- from unstructured_ingest.runner import JiraRunner
+ import logging
+ import asyncio, os
+ import sys
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from pydantic import BaseModel, Field, AliasChoices
- from typing import Optional, Union
- import requests
- import unstructured_ingest.connector.jira
+ from typing import Any, Generator, Iterable, Optional, Union
+ from unstructured_ingest.pipeline.pipeline import Pipeline
+ from unstructured_ingest.processes.connectors.jira import (
+     JiraIndexerConfig,
+     JiraIndexer,
+     JiraIssueMetadata,
+     api_page_based_generator,
+     JiraDownloaderConfig,
+     JiraDownloader,
+     DEFAULT_C_SEP,
+     DEFAULT_R_SEP,
+     JiraConnectionConfig,
+     JiraAccessConfig
+ )
+ from unstructured_ingest.pipeline.pipeline import (
+     Pipeline,
+     PartitionerConfig,
+     FiltererConfig
+ )
+ from unstructured_ingest.interfaces import ProcessorConfig

  class JiraParams(BaseModel):
      """
      JiraParams is a Pydantic model that represents the parameters required to interact with a Jira instance.
+     Docs: https://docs.unstructured.io/open-source/ingestion/source-connectors/jira#jira

      Attributes:
          url (str): The URL of the Jira instance, e.g., 'https://example.atlassian.net'.
-         access_token (str): The access token for authenticating with the Jira API.
+         access_token (str): The access token for authenticating with the Jira API: https://id.atlassian.com/manage-profile/security/api-tokens
          user_email (str): The email address of the Jira user.
          projects (list[str]): A list of project keys or IDs to interact with, e.g., ['SCRUM', 'PROJ1'].
          boards (Optional[list[str]]): An optional list of board IDs to interact with. Defaults to None, e.g., ['1', '2'].
          issues (Optional[list[str]]): An optional list of issue keys or IDs to interact with. Defaults to None, e.g., ['SCRUM-1', 'PROJ1-1'].
      """
-     url: str
-     access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
-     user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"))
+     url: str = Field(..., pattern=r'^https?:\/\/.+')
+     access_token: str = Field(..., validation_alias=AliasChoices("accessToken","access_token"), min_length=1)
+     user_email: str = Field(validation_alias=AliasChoices("userEmail","user_email"), min_length=1)
      projects: list[str]
      boards: Optional[list[str]] | None = None
      issues: Optional[list[str]] | None = None
-     fieldsMappingUrl: Optional[str] | None = None
+     status_filters: Optional[list[str]] | None = None

  class Jira(IntegrationStrategy):
-     DEFAULT_C_SEP = " " * 5
-     DEFAULT_R_SEP = "\n"
      def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
          super().__init__(knowledgebase_path, data)
          self.__data = JiraParams.model_validate(self.data)
+         self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
      def working_subdirectory(self) -> str:
          return 'jira'
      def run(self) -> None:
-         unstructured_ingest.connector.jira._get_dropdown_fields_for_issue = self._get_dropdown_fields_for_issue
-         access_config = JiraAccessConfig(
-             api_token=self.__data.access_token
-         )
-         config = SimpleJiraConfig(
-             user_email=self.__data.user_email,
-             url = self.__data.url,
-             access_config=access_config,
+         indexer_config = JiraIndexerConfig(
              projects=self.__data.projects,
              boards=self.__data.boards,
-             issues=self.__data.issues
-         )
-         runner = JiraRunner(
-             connector_config=config,
-             processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
-             read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
-             partition_config=None,
-             retry_strategy_config=None
+             issues=self.__data.issues,
+             status_filters=self.__data.status_filters
          )
-         runner.run()
+         downloader_config = JiraDownloaderConfig(
+             download_dir=self.working_directory,
+             download_attachments=False
+         )
+         _is_cloud = "atlassian.net" in self.__data.url
+         _access_config = JiraAccessConfig(token=self.__data.access_token) \
+             if not _is_cloud \
+             else JiraAccessConfig(password=self.__data.access_token)
+         connection_config = JiraConnectionConfig(
+             access_config=_access_config,
+             username=self.__data.user_email,
+             url=self.__data.url,
+             cloud=_is_cloud
+         )
+         pipeline: Pipeline = self.__unstructured_ingest.pipeline(
+             indexer_config,
+             downloader_config,
+             connection_config,
+             extension=None)
+         if _is_cloud and sys.platform == "win32":
+             pipeline.indexer_step.process = CustomJiraIndexer(**vars(pipeline.indexer_step.process))
+         pipeline.downloader_step.process = CustomJiraDownloader(**vars(pipeline.downloader_step.process))
+         pipeline.run()
      async def load(self) -> list[Document]:
          await asyncio.to_thread(self.run)
          await asyncio.sleep(1)
          return await Loader(self.working_directory).load()

-     def _remap_custom_fields(self, field_list):
-         auth = (self.__data.user_email, self.__data.access_token)
-         response = requests.get(self.__data.fieldsMappingUrl, auth=auth)

-         if response.status_code == 200:
-             mapper: dict = response.json()
-             remapped_field_list = {}
-             for field_key, field_value in field_list.items():
-                 new_key = None
-                 for map_item in mapper:
-                     if field_key == map_item["id"]:
-                         # use the mapped name as the new key
-                         new_key = map_item["name"]
-                         break
-
-                 if new_key is None:
-                     new_key = field_key
+ # region override
+ class CustomJiraIndexer(JiraIndexer):
+     """
+     fix default run_jql for cloud: missing enhanced_jql
+     """
+     import sys
+     def __init__(self, **kwargs):
+         for key, value in kwargs.items():
+             try:
+                 setattr(super(), key, value)
+             except AttributeError:
+                 setattr(self, key, value)
+     def run_jql(self, jql: str, **kwargs) -> Generator[JiraIssueMetadata, None, None]:
+         with self.connection_config.get_client() as client:
+             for issue in api_page_based_generator(client.jql, jql=jql, **kwargs):
+                 yield JiraIssueMetadata.model_validate(issue)

-                 remapped_field_list[new_key] = field_value
+ class CustomJiraDownloader(JiraDownloader):
+     CUSTOM_FIELDS: list | None = None
+     def _set_custom_fields(self) -> list:
+         with self.connection_config.get_client() as client:
+             _custom_fields = client.get_all_custom_fields()
+         return [{"id": item["id"], "name": item["name"]} for item in _custom_fields]
+     def __init__(self, **kwargs):
+         for key, value in kwargs.items():
+             try:
+                 setattr(super(), key, value)
+             except AttributeError:
+                 setattr(self, key, value)
+         if not self.CUSTOM_FIELDS:
+             self.CUSTOM_FIELDS = self._set_custom_fields()

-         return remapped_field_list
+     def _get_custom_fields_for_issue(self, issue: dict, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP) -> str:
+         def _parse_value(value: Any) -> Any:
+             if isinstance(value, dict):
+                 _candidate = ["displayName", "name", "value"]
+                 for item in _candidate:
+                     if item in value:
+                         return value[item]
+             return value
+         def _remap_custom_fields(fields: dict):
+             remapped_fields = {}
+             for field_key, field_value in fields.items():
+                 new_key = next((map_item["name"] for map_item in self.CUSTOM_FIELDS if field_key == map_item["id"]), field_key)
+                 if new_key != field_value:
+                     remapped_fields[new_key] = field_value
+             return remapped_fields
+         filtered_fields = {key: _parse_value(value) for key, value in issue.items() if value is not None and type(value) not in [list]}
+         custom_fields = _remap_custom_fields(filtered_fields)
+         return (r_sep + c_sep).join([f"{key}: {value}{r_sep}" for key, value in custom_fields.items()])

-     def _get_dropdown_fields_for_issue(self, issue, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP):
-         all_fields = {}
-         for key, value in issue.items():
-             if value is not None:
-                 if isinstance(value, list) and (len(value) > 0):
-                     all_fields[key] = value
-                 else:
-                     all_fields[key] = value
-         mapped_fields = self._remap_custom_fields(all_fields)
-         return f"""
-         IssueType:{issue["issuetype"]["name"]}
-         {r_sep}
-         Status:{issue["status"]["name"]}
-         {r_sep}
-         Priority:{issue["priority"]}
-         {r_sep}
-         AssigneeID_Name:{issue["assignee"]["accountId"]}{c_sep}{issue["assignee"]["displayName"]}
-         {r_sep}
-         ReporterAdr_Name:{issue["reporter"]["emailAddress"]}{c_sep}{issue["reporter"]["displayName"]}
-         {r_sep}
-         Labels:{c_sep.join(issue["labels"])}
-         {r_sep}
-         Components:{c_sep.join([component["name"] for component in issue["components"]])}
+     def _get_text_fields_for_issue(self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP) -> str:
+         #no need any more: original data will be included in the custom fields
+         #_origin = super()._get_text_fields_for_issue(issue, c_sep=c_sep, r_sep=r_sep)
+         _custom_fields = self._get_custom_fields_for_issue(issue, c_sep=c_sep, r_sep=r_sep)
+         return f"""Details:
          {r_sep}
-         {(r_sep + c_sep ).join([f"{key}:{value}{r_sep}" for key, value in mapped_fields.items()])}
-         """
+         {_custom_fields}"""
+ # endregion
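CustomJiraDownloader replaces the old fieldsMappingUrl HTTP lookup with a one-time get_all_custom_fields() call, then flattens each issue: None values and lists are dropped, Atlassian's nested value objects are unwrapped via displayName/name/value, and opaque customfield_* ids are renamed to their labels. A standalone sketch of that transformation (data is made up; the original's extra `new_key != field_value` guard is omitted for clarity):

# Standalone sketch of the issue-field flattening done by
# CustomJiraDownloader._get_custom_fields_for_issue above.
from typing import Any

CUSTOM_FIELDS = [{"id": "customfield_10001", "name": "Story Points"}]  # normally fetched from Jira

def parse_value(value: Any) -> Any:
    if isinstance(value, dict):
        for key in ("displayName", "name", "value"):  # Atlassian wraps scalars in objects
            if key in value:
                return value[key]
    return value

def remap_custom_fields(fields: dict) -> dict:
    remapped = {}
    for field_key, field_value in fields.items():
        # replace opaque customfield_* ids with their human-readable names
        new_key = next((m["name"] for m in CUSTOM_FIELDS if field_key == m["id"]), field_key)
        remapped[new_key] = field_value
    return remapped

issue = {
    "status": {"name": "In Progress"},
    "assignee": {"displayName": "Jane Doe"},
    "customfield_10001": 5,
    "labels": ["a", "b"],   # lists are dropped by the type filter
    "duedate": None,        # None values are dropped too
}
flat = {k: parse_value(v) for k, v in issue.items() if v is not None and not isinstance(v, list)}
print(remap_custom_fields(flat))
# {'status': 'In Progress', 'assignee': 'Jane Doe', 'Story Points': 5}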
ws_bom_robot_app/llm/vector_store/integration/manager.py
@@ -12,7 +12,9 @@ from ws_bom_robot_app.llm.vector_store.integration.sftp import Sftp
  from ws_bom_robot_app.llm.vector_store.integration.sharepoint import Sharepoint
  from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap
  from ws_bom_robot_app.llm.vector_store.integration.slack import Slack
-
+ from ws_bom_robot_app.llm.vector_store.integration.thron import Thron
+ from ws_bom_robot_app.llm.vector_store.integration.shopify import Shopify
+ from ws_bom_robot_app.llm.vector_store.integration.api import Api
  class IntegrationManager:
      _list: dict[str, Type[IntegrationStrategy]] = {
          "llmkbazure": Azure,
@@ -27,7 +29,9 @@ class IntegrationManager:
          "llmkbsharepoint": Sharepoint,
          "llmkbsitemap": Sitemap,
          "llmkbslack": Slack,
-
+         "llmkbthron": Thron,
+         "llmkbshopify": Shopify,
+         "llmkbapi": Api,
      }
      @classmethod
      def get_strategy(cls, name: str, knowledgebase_path: str, data: dict[str, str]) -> IntegrationStrategy:
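IntegrationManager remains a plain string-to-class registry keyed by CMS type names; the three new connectors (Thron, Shopify, Api) just add entries. A sketch of the lookup pattern with dummy strategies, assuming get_strategy simply instantiates the mapped class (its body is not shown in this diff):

# Sketch of the registry pattern; the strategies are stand-ins and the
# get_strategy body is assumed, not taken from the diff.
from typing import Type

class IntegrationStrategy:
    def __init__(self, knowledgebase_path: str, data: dict[str, str]):
        self.knowledgebase_path, self.data = knowledgebase_path, data

class Shopify(IntegrationStrategy): pass
class Thron(IntegrationStrategy): pass

class IntegrationManager:
    _list: dict[str, Type[IntegrationStrategy]] = {
        "llmkbshopify": Shopify,
        "llmkbthron": Thron,
    }
    @classmethod
    def get_strategy(cls, name: str, knowledgebase_path: str, data: dict[str, str]) -> IntegrationStrategy:
        return cls._list[name](knowledgebase_path, data)  # assumed lookup-and-instantiate

strategy = IntegrationManager.get_strategy("llmkbshopify", "/kb", {"shop": "example"})
assert isinstance(strategy, Shopify)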
ws_bom_robot_app/llm/vector_store/integration/s3.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/sftp.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/sharepoint.py
@@ -1,7 +1,7 @@
  import asyncio, logging, traceback
  from dataclasses import dataclass
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointIndexer, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
+ from unstructured_ingest.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointIndexer, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
@@ -14,22 +14,18 @@ class SharepointParams(BaseModel):
      Attributes:
          client_id (str): The client ID for SharePoint authentication.
          client_secret (str): The client secret for SharePoint authentication.
+         tenant_id (str, optional): The tenant ID for SharePoint authentication. Defaults to None.
          site_url (str): The URL of the SharePoint site, i.e. site collection level: https://<tenant>.sharepoint.com/sites/<site-collection-name>, or root site: https://<tenant>.sharepoint.com
          site_path (str, optional): The path in the SharePoint site from which to start parsing files, for example "Shared Documents". Defaults to None.
          recursive (bool, optional): Whether to recursively access subdirectories. Defaults to False.
-         omit_files (bool, optional): Whether to omit files from the results. Defaults to False.
-         omit_pages (bool, optional): Whether to omit pages from the results. Defaults to False.
-         omit_lists (bool, optional): Whether to omit lists from the results. Defaults to False.
          extension (list[str], optional): A list of file extensions to include, i.e. [".pdf"]. Defaults to None.
      """
      client_id: str = Field(validation_alias=AliasChoices("clientId","client_id"))
      client_secret: str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
      site_url: str = Field(validation_alias=AliasChoices("siteUrl","site_url"))
      site_path: str = Field(default=None, validation_alias=AliasChoices("sitePath","site_path"))
+     tenant_id: str = Field(default=None, validation_alias=AliasChoices("tenantId","tenant_id"))
      recursive: bool = Field(default=False)
-     omit_files: bool = Field(default=False, validation_alias=AliasChoices("omitFiles","omit_files")),
-     omit_pages: bool = Field(default=False, validation_alias=AliasChoices("omitPages","omit_pages")),
-     omit_lists: bool = Field(default=False, validation_alias=AliasChoices("omitLists","omit_lists")),
      extension: list[str] = Field(default=None)
  class Sharepoint(IntegrationStrategy):
      def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
@@ -41,10 +37,7 @@ class Sharepoint(IntegrationStrategy):
      def run(self) -> None:
          indexer_config = SharepointIndexerConfig(
              path=self.__data.site_path,
-             recursive=self.__data.recursive,
-             omit_files=self.__data.omit_files,
-             omit_pages=self.__data.omit_pages,
-             omit_lists=self.__data.omit_lists
+             recursive=self.__data.recursive
          )
          downloader_config = SharepointDownloaderConfig(
              download_dir=self.working_directory
@@ -53,15 +46,15 @@ class Sharepoint(IntegrationStrategy):
              access_config=SharepointAccessConfig(client_cred=self.__data.client_secret),
              client_id=self.__data.client_id,
              site=self.__data.site_url,
-             permissions_config=None
+             tenant=self.__data.tenant_id if self.__data.tenant_id else None
          )
          pipeline = self.__unstructured_ingest.pipeline(
              indexer_config,
              downloader_config,
              connection_config,
              extension=self.__data.extension)
-         current_indexer_process = pipeline.indexer_step.process
-         pipeline.indexer_step.process = CustomSharepointIndexer(**vars(current_indexer_process))
+         #current_indexer_process = pipeline.indexer_step.process
+         #pipeline.indexer_step.process = CustomSharepointIndexer(**vars(current_indexer_process))
          pipeline.run()
      async def load(self) -> list[Document]:
          await asyncio.to_thread(self.run)
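SharepointParams drops the omit_* flags (whose removed declarations ended in stray trailing commas, which made their assigned values one-element tuples rather than Field definitions) and adds an optional tenant_id, forwarded as `tenant=` only when set. A minimal re-declared sketch:

# Sketch only (plain pydantic v2, not imported from the package); the real model
# types tenant_id as str with a None default, narrowed here to Optional[str].
from typing import Optional
from pydantic import AliasChoices, BaseModel, Field

class SharepointParamsSketch(BaseModel):
    client_id: str = Field(validation_alias=AliasChoices("clientId", "client_id"))
    client_secret: str = Field(validation_alias=AliasChoices("clientSecret", "client_secret"))
    site_url: str = Field(validation_alias=AliasChoices("siteUrl", "site_url"))
    tenant_id: Optional[str] = Field(default=None, validation_alias=AliasChoices("tenantId", "tenant_id"))
    recursive: bool = False

params = SharepointParamsSketch.model_validate({
    "clientId": "app-id",
    "clientSecret": "app-secret",
    "siteUrl": "https://contoso.sharepoint.com/sites/kb",
})
tenant = params.tenant_id if params.tenant_id else None   # mirrors the tenant= kwarg above
assert tenant is None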