ws-bom-robot-app 0.0.63__py3-none-any.whl → 0.0.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. ws_bom_robot_app/config.py +30 -8
  2. ws_bom_robot_app/cron_manager.py +13 -12
  3. ws_bom_robot_app/llm/agent_context.py +1 -1
  4. ws_bom_robot_app/llm/agent_handler.py +11 -12
  5. ws_bom_robot_app/llm/agent_lcel.py +80 -18
  6. ws_bom_robot_app/llm/api.py +69 -7
  7. ws_bom_robot_app/llm/evaluator.py +319 -0
  8. ws_bom_robot_app/llm/main.py +51 -28
  9. ws_bom_robot_app/llm/models/api.py +40 -6
  10. ws_bom_robot_app/llm/nebuly_handler.py +18 -15
  11. ws_bom_robot_app/llm/providers/llm_manager.py +233 -75
  12. ws_bom_robot_app/llm/tools/tool_builder.py +4 -1
  13. ws_bom_robot_app/llm/tools/tool_manager.py +48 -22
  14. ws_bom_robot_app/llm/utils/chunker.py +6 -1
  15. ws_bom_robot_app/llm/utils/cleanup.py +81 -0
  16. ws_bom_robot_app/llm/utils/cms.py +60 -14
  17. ws_bom_robot_app/llm/utils/download.py +112 -8
  18. ws_bom_robot_app/llm/vector_store/db/base.py +50 -0
  19. ws_bom_robot_app/llm/vector_store/db/chroma.py +28 -8
  20. ws_bom_robot_app/llm/vector_store/db/faiss.py +35 -8
  21. ws_bom_robot_app/llm/vector_store/db/qdrant.py +29 -14
  22. ws_bom_robot_app/llm/vector_store/integration/api.py +216 -0
  23. ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
  24. ws_bom_robot_app/llm/vector_store/integration/base.py +58 -15
  25. ws_bom_robot_app/llm/vector_store/integration/confluence.py +33 -5
  26. ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  27. ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
  28. ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
  29. ws_bom_robot_app/llm/vector_store/integration/googledrive.py +46 -17
  30. ws_bom_robot_app/llm/vector_store/integration/jira.py +93 -60
  31. ws_bom_robot_app/llm/vector_store/integration/manager.py +6 -2
  32. ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
  33. ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
  34. ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
  35. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
  36. ws_bom_robot_app/llm/vector_store/integration/sitemap.py +6 -1
  37. ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
  38. ws_bom_robot_app/llm/vector_store/integration/thron.py +236 -0
  39. ws_bom_robot_app/llm/vector_store/loader/base.py +52 -8
  40. ws_bom_robot_app/llm/vector_store/loader/docling.py +71 -33
  41. ws_bom_robot_app/main.py +148 -146
  42. ws_bom_robot_app/subprocess_runner.py +106 -0
  43. ws_bom_robot_app/task_manager.py +204 -53
  44. ws_bom_robot_app/util.py +6 -0
  45. {ws_bom_robot_app-0.0.63.dist-info → ws_bom_robot_app-0.0.103.dist-info}/METADATA +158 -75
  46. ws_bom_robot_app-0.0.103.dist-info/RECORD +76 -0
  47. ws_bom_robot_app/llm/settings.py +0 -4
  48. ws_bom_robot_app/llm/utils/kb.py +0 -34
  49. ws_bom_robot_app-0.0.63.dist-info/RECORD +0 -72
  50. {ws_bom_robot_app-0.0.63.dist-info → ws_bom_robot_app-0.0.103.dist-info}/WHEEL +0 -0
  51. {ws_bom_robot_app-0.0.63.dist-info → ws_bom_robot_app-0.0.103.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/integration/jira.py
@@ -1,21 +1,39 @@
+import logging
 import asyncio, os
-from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
+import sys
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from pydantic import BaseModel, Field, AliasChoices
-from typing import Any, Optional, Union
-from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
-from unstructured_ingest.connector.jira import SimpleJiraConfig, JiraAccessConfig, JiraSourceConnector, JiraIngestDoc, nested_object_to_field_getter, _get_id_fields_for_issue, _get_project_fields_for_issue
-from unstructured_ingest.runner import JiraRunner
-
+from typing import Any, Generator, Iterable, Optional, Union
+from unstructured_ingest.pipeline.pipeline import Pipeline
+from unstructured_ingest.processes.connectors.jira import (
+    JiraIndexerConfig,
+    JiraIndexer,
+    JiraIssueMetadata,
+    api_page_based_generator,
+    JiraDownloaderConfig,
+    JiraDownloader,
+    DEFAULT_C_SEP,
+    DEFAULT_R_SEP,
+    JiraConnectionConfig,
+    JiraAccessConfig
+)
+from unstructured_ingest.pipeline.pipeline import (
+    Pipeline,
+    PartitionerConfig,
+    FiltererConfig
+)
+from unstructured_ingest.interfaces import ProcessorConfig
 
 class JiraParams(BaseModel):
     """
     JiraParams is a Pydantic model that represents the parameters required to interact with a Jira instance.
+    Docs: https://docs.unstructured.io/open-source/ingestion/source-connectors/jira#jira
 
     Attributes:
         url (str): The URL of the Jira instance, e.g., 'https://example.atlassian.net'.
-        access_token (str): The access token for authenticating with the Jira API.
+        access_token (str): The access token for authenticating with the Jira API: https://id.atlassian.com/manage-profile/security/api-tokens
         user_email (str): The email address of the Jira user.
         projects (list[str]): A list of project keys or IDs to interact with, e.g., ['SCRUM', 'PROJ1'].
         boards (Optional[list[str]]): An optional list of board IDs to interact with. Defaults to None, e.g., ['1', '2'].
@@ -27,34 +45,45 @@ class JiraParams(BaseModel):
     projects: list[str]
     boards: Optional[list[str]] | None = None
     issues: Optional[list[str]] | None = None
+    status_filters: Optional[list[str]] | None = None
 
 class Jira(IntegrationStrategy):
     def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
         super().__init__(knowledgebase_path, data)
         self.__data = JiraParams.model_validate(self.data)
+        self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
     def working_subdirectory(self) -> str:
         return 'jira'
     def run(self) -> None:
-        access_config = JiraAccessConfig(
-            api_token=self.__data.access_token
-        )
-        config = SimpleJiraConfig(
-            user_email=self.__data.user_email,
-            url = self.__data.url,
-            access_config=access_config,
+        indexer_config = JiraIndexerConfig(
             projects=self.__data.projects,
             boards=self.__data.boards,
-            issues=self.__data.issues
-        )
-        # runner override: waiting for v2 migration https://github.com/Unstructured-IO/unstructured-ingest/issues/106
-        runner = _JiraRunner(
-            connector_config=config,
-            processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
-            read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
-            partition_config=None,
-            retry_strategy_config=None
+            issues=self.__data.issues,
+            status_filters=self.__data.status_filters
         )
-        runner.run()
+        downloader_config = JiraDownloaderConfig(
+            download_dir=self.working_directory,
+            download_attachments=False
+        )
+        _is_cloud = "atlassian.net" in self.__data.url
+        _access_config = JiraAccessConfig(token=self.__data.access_token) \
+            if not _is_cloud \
+            else JiraAccessConfig(password=self.__data.access_token)
+        connection_config = JiraConnectionConfig(
+            access_config=_access_config,
+            username=self.__data.user_email,
+            url=self.__data.url,
+            cloud=_is_cloud
+        )
+        pipeline: Pipeline = self.__unstructured_ingest.pipeline(
+            indexer_config,
+            downloader_config,
+            connection_config,
+            extension=None)
+        if _is_cloud and sys.platform == "win32":
+            pipeline.indexer_step.process = CustomJiraIndexer(**vars(pipeline.indexer_step.process))
+            pipeline.downloader_step.process = CustomJiraDownloader(**vars(pipeline.downloader_step.process))
+        pipeline.run()
     async def load(self) -> list[Document]:
         await asyncio.to_thread(self.run)
         await asyncio.sleep(1)
@@ -62,8 +91,38 @@ class Jira(IntegrationStrategy):
 
 
 # region override
-class _JiraIngestDoc(JiraIngestDoc):
-    def _get_dropdown_custom_fields_for_issue(issue: dict, c_sep=" " * 5, r_sep="\n") -> str:
+class CustomJiraIndexer(JiraIndexer):
+    """
+    fix default run_jql for cloud: missing enhanced_jql
+    """
+    import sys
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            try:
+                setattr(super(), key, value)
+            except AttributeError:
+                setattr(self, key, value)
+    def run_jql(self, jql: str, **kwargs) -> Generator[JiraIssueMetadata, None, None]:
+        with self.connection_config.get_client() as client:
+            for issue in api_page_based_generator(client.jql, jql=jql, **kwargs):
+                yield JiraIssueMetadata.model_validate(issue)
+
+class CustomJiraDownloader(JiraDownloader):
+    CUSTOM_FIELDS: list | None = None
+    def _set_custom_fields(self) -> list:
+        with self.connection_config.get_client() as client:
+            _custom_fields = client.get_all_custom_fields()
+            return [{"id": item["id"], "name": item["name"]} for item in _custom_fields]
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            try:
+                setattr(super(), key, value)
+            except AttributeError:
+                setattr(self, key, value)
+        if not self.CUSTOM_FIELDS:
+            self.CUSTOM_FIELDS = self._set_custom_fields()
+
+    def _get_custom_fields_for_issue(self, issue: dict, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP) -> str:
         def _parse_value(value: Any) -> Any:
             if isinstance(value, dict):
                 _candidate = ["displayName", "name", "value"]
@@ -74,45 +133,19 @@ class _JiraIngestDoc(JiraIngestDoc):
         def _remap_custom_fields(fields: dict):
             remapped_fields = {}
             for field_key, field_value in fields.items():
-                new_key = next((map_item["name"] for map_item in _JiraSourceConnector.CUSTOM_FIELDS if field_key == map_item["id"]), field_key)
+                new_key = next((map_item["name"] for map_item in self.CUSTOM_FIELDS if field_key == map_item["id"]), field_key)
                 if new_key != field_value:
                     remapped_fields[new_key] = field_value
             return remapped_fields
         filtered_fields = {key: _parse_value(value) for key, value in issue.items() if value is not None and type(value) not in [list]}
         custom_fields = _remap_custom_fields(filtered_fields)
         return (r_sep + c_sep).join([f"{key}: {value}{r_sep}" for key, value in custom_fields.items()])
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        _issue = self.issue
-        _nested: dict = nested_object_to_field_getter(_issue["fields"])
-        document = "\n\n\n".join(
-            [
-                _get_id_fields_for_issue(_issue),
-                _get_project_fields_for_issue(_nested),
-                _JiraIngestDoc._get_dropdown_custom_fields_for_issue(_nested)
-            ],
-        )
-        _full_filename = str(self.filename)
-        _file_extension = _full_filename.split(".")[-1]
-        _file_without_extension = _full_filename.replace(f".{_file_extension}","")
-        os.makedirs(os.path.dirname(_file_without_extension), exist_ok=True)
-        with open(f"{_file_without_extension}_extra.{_file_extension}", "w", encoding="utf8") as f:
-            f.write(document)
-
-class _JiraSourceConnector(JiraSourceConnector):
-    CUSTOM_FIELDS: list | None = None
-    def __set_custom_fields(self) -> None:
-        _custom_fields = self.jira.get_all_custom_fields()
-        _JiraSourceConnector.CUSTOM_FIELDS = [{"id": item["id"], "name": item["name"]} for item in _custom_fields]
-        self._jira = None  # fix serialization
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if not _JiraSourceConnector.CUSTOM_FIELDS:
-            self.__set_custom_fields()
-    def get_ingest_docs(self) -> list[_JiraIngestDoc]:
-        return [_JiraIngestDoc(**item.__dict__) for item in super().get_ingest_docs()]
 
-class _JiraRunner(JiraRunner):
-    def get_source_connector_cls(self):
-        return _JiraSourceConnector
+    def _get_text_fields_for_issue(self, issue: dict, c_sep: str = DEFAULT_C_SEP, r_sep: str = DEFAULT_R_SEP) -> str:
+        # no longer needed: the original data is included in the custom fields
+        #_origin = super()._get_text_fields_for_issue(issue, c_sep=c_sep, r_sep=r_sep)
+        _custom_fields = self._get_custom_fields_for_issue(issue, c_sep=c_sep, r_sep=r_sep)
+        return f"""Details:
+{r_sep}
+{_custom_fields}"""
 # endregion
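
For orientation, a minimal usage sketch of the rewritten integration (hypothetical values; the snake_case keys for url, access_token and user_email are assumed from the docstring above, since their field definitions fall outside these hunks):

    import asyncio
    from ws_bom_robot_app.llm.vector_store.integration.jira import Jira

    jira = Jira(
        knowledgebase_path="./kb",  # hypothetical local path
        data={
            "url": "https://example.atlassian.net",  # cloud URL, so _is_cloud resolves True
            "access_token": "<atlassian-api-token>",
            "user_email": "user@example.com",
            "projects": ["SCRUM"],
            "status_filters": ["Done"],  # new in 0.0.103
        },
    )
    # load() runs the ingest pipeline in a worker thread, then returns langchain Documents
    documents = asyncio.run(jira.load())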
ws_bom_robot_app/llm/vector_store/integration/manager.py
@@ -12,7 +12,9 @@ from ws_bom_robot_app.llm.vector_store.integration.sftp import Sftp
 from ws_bom_robot_app.llm.vector_store.integration.sharepoint import Sharepoint
 from ws_bom_robot_app.llm.vector_store.integration.sitemap import Sitemap
 from ws_bom_robot_app.llm.vector_store.integration.slack import Slack
-
+from ws_bom_robot_app.llm.vector_store.integration.thron import Thron
+from ws_bom_robot_app.llm.vector_store.integration.shopify import Shopify
+from ws_bom_robot_app.llm.vector_store.integration.api import Api
 class IntegrationManager:
     _list: dict[str, Type[IntegrationStrategy]] = {
         "llmkbazure": Azure,
@@ -27,7 +29,9 @@ class IntegrationManager:
         "llmkbsharepoint": Sharepoint,
         "llmkbsitemap": Sitemap,
         "llmkbslack": Slack,
-
+        "llmkbthron": Thron,
+        "llmkbshopify": Shopify,
+        "llmkbapi": Api,
     }
     @classmethod
     def get_strategy(cls, name: str, knowledgebase_path: str, data: dict[str, str]) -> IntegrationStrategy:
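
A minimal dispatch sketch for the three new registry keys (hypothetical path and credentials; the camelCase keys follow the ShopifyParams aliases shown later in this diff):

    import asyncio
    from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager

    # "llmkbshopify" now resolves to the Shopify strategy added in this release
    strategy = IntegrationManager.get_strategy(
        "llmkbshopify",
        knowledgebase_path="./kb",  # hypothetical
        data={
            "shopName": "my-shop",
            "accessToken": "<shopify-admin-token>",
            "graphqlQuery": "query ($first: Int!, $after: String) { ... }",
        },
    )
    documents = asyncio.run(strategy.load())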
ws_bom_robot_app/llm/vector_store/integration/s3.py
@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.v2.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
+from unstructured_ingest.processes.connectors.fsspec.s3 import S3ConnectionConfig, S3AccessConfig, S3DownloaderConfig, S3IndexerConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/sftp.py
@@ -1,6 +1,6 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.v2.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
+from unstructured_ingest.processes.connectors.fsspec.sftp import SftpConnectionConfig, SftpAccessConfig, SftpDownloaderConfig, SftpIndexerConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/sharepoint.py
@@ -1,7 +1,7 @@
 import asyncio, logging, traceback
 from dataclasses import dataclass
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.v2.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointIndexer, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
+from unstructured_ingest.processes.connectors.sharepoint import SharepointIndexerConfig, SharepointIndexer, SharepointDownloaderConfig, SharepointConnectionConfig, SharepointAccessConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union, Optional
@@ -14,22 +14,18 @@ class SharepointParams(BaseModel):
     Attributes:
         client_id (str): The client ID for SharePoint authentication.
         client_secret (str): The client secret for SharePoint authentication.
+        tenant_id (str, optional): The tenant ID for SharePoint authentication. Defaults to None.
         site_url (str): The URL of the SharePoint site, i.e. site collection level: https://<tenant>.sharepoint.com/sites/<site-collection-name>, or root site: https://<tenant>.sharepoint.com
         site_path (str, optional): The path in the SharePoint site from which to start parsing files, for example "Shared Documents". Defaults to None.
         recursive (bool, optional): Whether to recursively access subdirectories. Defaults to False.
-        omit_files (bool, optional): Whether to omit files from the results. Defaults to False.
-        omit_pages (bool, optional): Whether to omit pages from the results. Defaults to False.
-        omit_lists (bool, optional): Whether to omit lists from the results. Defaults to False.
         extension (list[str], optional): A list of file extensions to include, i.e. [".pdf"]. Defaults to None.
     """
     client_id: str = Field(validation_alias=AliasChoices("clientId","client_id"))
     client_secret: str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
     site_url: str = Field(validation_alias=AliasChoices("siteUrl","site_url"))
     site_path: str = Field(default=None,validation_alias=AliasChoices("sitePath","site_path"))
+    tenant_id: str = Field(default=None, validation_alias=AliasChoices("tenantId","tenant_id"))
     recursive: bool = Field(default=False)
-    omit_files: bool = Field(default=False, validation_alias=AliasChoices("omitFiles","omit_files")),
-    omit_pages: bool = Field(default=False, validation_alias=AliasChoices("omitPages","omit_pages")),
-    omit_lists: bool = Field(default=False, validation_alias=AliasChoices("omitLists","omit_lists")),
     extension: list[str] = Field(default=None)
 class Sharepoint(IntegrationStrategy):
     def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
@@ -41,10 +37,7 @@ class Sharepoint(IntegrationStrategy):
     def run(self) -> None:
         indexer_config = SharepointIndexerConfig(
             path=self.__data.site_path,
-            recursive=self.__data.recursive,
-            omit_files=self.__data.omit_files,
-            omit_pages=self.__data.omit_pages,
-            omit_lists=self.__data.omit_lists
+            recursive=self.__data.recursive
         )
         downloader_config = SharepointDownloaderConfig(
             download_dir=self.working_directory
@@ -53,15 +46,15 @@ class Sharepoint(IntegrationStrategy):
             access_config=SharepointAccessConfig(client_cred=self.__data.client_secret),
             client_id=self.__data.client_id,
             site=self.__data.site_url,
-            permissions_config=None
+            tenant=self.__data.tenant_id if self.__data.tenant_id else None
         )
         pipeline = self.__unstructured_ingest.pipeline(
             indexer_config,
             downloader_config,
             connection_config,
             extension=self.__data.extension)
-        current_indexer_process = pipeline.indexer_step.process
-        pipeline.indexer_step.process = CustomSharepointIndexer(**vars(current_indexer_process))
+        #current_indexer_process = pipeline.indexer_step.process
+        #pipeline.indexer_step.process = CustomSharepointIndexer(**vars(current_indexer_process))
         pipeline.run()
     async def load(self) -> list[Document]:
         await asyncio.to_thread(self.run)
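
A minimal validation sketch for the updated parameters (hypothetical credentials); the three omit_* flags are gone and tenantId is new, forwarded to SharepointConnectionConfig as tenant:

    from ws_bom_robot_app.llm.vector_store.integration.sharepoint import SharepointParams

    params = SharepointParams.model_validate({
        "clientId": "<app-client-id>",          # hypothetical app registration
        "clientSecret": "<app-client-secret>",
        "tenantId": "<tenant-guid>",            # new optional field in 0.0.103
        "siteUrl": "https://contoso.sharepoint.com/sites/docs",
        "sitePath": "Shared Documents",
        "recursive": True,
        "extension": [".pdf"],
    })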
ws_bom_robot_app/llm/vector_store/integration/shopify.py
@@ -0,0 +1,143 @@
+import asyncio, logging, aiohttp
+from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
+from langchain_core.documents import Document
+from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+from typing import List, Union, Optional
+from pydantic import BaseModel, Field, AliasChoices, field_validator
+import json
+import os
+
+class ShopifyParams(BaseModel):
+    """
+    ShopifyParams is a model that defines the parameters required for Shopify integration.
+
+    Attributes:
+        shop_name (str): The shop name for Shopify.
+        access_token (str): The access token for Shopify.
+        graphql_query (Union[str, dict]): The GraphQL query string or dict for Shopify.
+    """
+    shop_name: str = Field(validation_alias=AliasChoices("shopName","shop_name"))
+    access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
+    graphql_query: Union[str, dict] = Field(validation_alias=AliasChoices("graphqlQuery","graphql_query"))
+
+    @field_validator('graphql_query')
+    @classmethod
+    def extract_query_string(cls, v):
+        """Extract the query string from dict format if needed"""
+        if isinstance(v, dict) and 'query' in v:
+            return v['query']
+        return v
+
+class Shopify(IntegrationStrategy):
+    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
+        super().__init__(knowledgebase_path, data)
+        self.__data = ShopifyParams.model_validate(self.data)
+
+    def working_subdirectory(self) -> str:
+        return 'shopify'
+
+    async def run(self) -> None:
+        _data = await self.__get_data()
+        json_file_path = os.path.join(self.working_directory, 'shopify_data.json')
+        with open(json_file_path, 'w', encoding='utf-8') as f:
+            json.dump(_data, f, ensure_ascii=False)
+
+    async def load(self) -> list[Document]:
+        await self.run()
+        await asyncio.sleep(1)
+        return await Loader(self.working_directory).load()
+
+    async def __get_data(self, page_size: int = 50) -> List[dict]:
+        # API URL
+        url = f"https://{self.__data.shop_name}.myshopify.com/admin/api/2024-07/graphql.json"
+
+        # Headers
+        headers = {
+            "X-Shopify-Access-Token": self.__data.access_token,
+            "Content-Type": "application/json"
+        }
+
+        all_data: List[dict] = []
+        has_next_page = True
+        cursor = None
+        retry_count = 0
+        max_retries = 5
+
+        while has_next_page:
+            # Variables for the query
+            variables = {
+                "first": page_size
+            }
+
+            if cursor:
+                variables["after"] = cursor
+
+            # Request payload
+            payload = {
+                "query": self.__data.graphql_query,
+                "variables": variables
+            }
+
+            try:
+                # Issue the request
+                async with aiohttp.ClientSession() as session:
+                    async with session.post(url, headers=headers, json=payload) as response:
+                        # Check that the response is JSON
+                        try:
+                            data = await response.json()
+                        except aiohttp.ContentTypeError:
+                            text = await response.text()
+                            logging.error(f"Non-JSON response received. Status code: {response.status}")
+                            logging.error(f"Content: {text}")
+                            raise Exception("Invalid response from API")
+
+                        # Throttling handling
+                        if "errors" in data:
+                            error = data["errors"][0]
+                            if error.get("extensions", {}).get("code") == "THROTTLED":
+                                retry_count += 1
+                                if retry_count > max_retries:
+                                    raise Exception("Too many throttling attempts. Stopping execution.")
+
+                                # Wait a little longer on each attempt
+                                wait_time = 2 ** retry_count  # exponential backoff
+                                print(f"Rate limit reached. Waiting {wait_time} seconds... (Attempt {retry_count}/{max_retries})")
+                                await asyncio.sleep(wait_time)
+                                continue
+                            else:
+                                raise Exception(f"GraphQL errors: {data['errors']}")
+
+                        # Reset the retry counter after a successful request
+                        retry_count = 0
+
+                        # Extract the data
+                        _data = list(data["data"].values())[0]
+                        edges = _data["edges"]
+                        page_info = _data["pageInfo"]
+
+                        # Append the data to the list
+                        for edge in edges:
+                            all_data.append(edge["node"])
+
+                        # Update the cursor and the pagination flag
+                        has_next_page = page_info["hasNextPage"]
+                        cursor = page_info["endCursor"]
+
+                        print(f"Retrieved {len(edges)} products. Total: {len(all_data)}")
+
+                        # Short pause to avoid saturating the API
+                        await asyncio.sleep(0.1)
+
+            except aiohttp.ClientError as e:
+                logging.error(f"Connection error: {e}")
+                retry_count += 1
+                if retry_count <= max_retries:
+                    wait_time = 2 ** retry_count
+                    logging.warning(f"Retrying in {wait_time} seconds...")
+                    await asyncio.sleep(wait_time)
+                    continue
+                else:
+                    raise Exception("Too many network errors. Stopping execution.")
+
+        logging.info(f"Data retrieval completed! Total data: {len(all_data)}")
+        return all_data
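
The pagination loop above expects a connection-style query: it takes the first value under data, reads its edges and pageInfo, and feeds endCursor back in as the after variable. A hypothetical query with a compatible shape:

    # hypothetical products query; any top-level connection exposing edges/pageInfo works
    PRODUCTS_QUERY = """
    query ($first: Int!, $after: String) {
      products(first: $first, after: $after) {
        edges { node { id title descriptionHtml } }
        pageInfo { hasNextPage endCursor }
      }
    }
    """

On throttling, the backoff waits 2, 4, 8, 16 and 32 seconds across the five allowed retries before the loop gives up.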
ws_bom_robot_app/llm/vector_store/integration/sitemap.py
@@ -1,3 +1,4 @@
+import sys, asyncio
 from typing import Any, AsyncGenerator, AsyncIterator
 import aiofiles
 import aiofiles.os
@@ -32,6 +33,7 @@ class Sitemap(IntegrationStrategy):
         self.__exclude_class: list[str] = self.data.get("excludeClass",[]) # type: ignore
         self.__exclude_id: list[str] = self.data.get("excludeId",[]) # type: ignore
         self.__restrict_to_same_domain: bool = self.data.get("restrictDomain", True) # type: ignore
+        self.__header_template = self.data.get("headers", None)
     def working_subdirectory(self) -> str:
         return ""
     def _extract(self, tag: Tag) -> str:
@@ -64,6 +66,8 @@ class Sitemap(IntegrationStrategy):
         return f"{self.knowledgebase_path}/{url}" if self._is_local(url) else url
     async def alazy_load(self, loader: SitemapLoader) -> AsyncIterator[Document]:
         """A lazy loader for Documents."""
+        if sys.platform == 'win32':
+            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
         iterator = await run_in_executor(None, loader.lazy_load)
         done = object()
         while True:
@@ -78,7 +82,8 @@ class Sitemap(IntegrationStrategy):
             filter_urls=self.__filter_urls,
             parsing_function=self._parse,
             is_local=self._is_local(self.__sitemap_url),
-            restrict_to_same_domain=self.__restrict_to_same_domain
+            restrict_to_same_domain=self.__restrict_to_same_domain,
+            header_template=self.__header_template
         )
         _docs = self._output([document async for document in self.alazy_load(_loader)])
         if self._is_local(self.__sitemap_url):
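
The new headers key is passed straight through to SitemapLoader as header_template, e.g. to set a custom User-Agent for sites that block the default crawler. A hypothetical payload (the sitemap URL key name is not shown in these hunks and is assumed here):

    data = {
        "sitemapUrl": "https://example.com/sitemap.xml",    # assumed key name
        "restrictDomain": True,
        "headers": {"User-Agent": "ws-bom-robot/0.0.103"},  # new in this release
    }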
ws_bom_robot_app/llm/vector_store/integration/slack.py
@@ -1,6 +1,7 @@
 import asyncio
 from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
-from unstructured_ingest.v2.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
+from unstructured_ingest.interfaces.downloader import DownloaderConfig
+from unstructured_ingest.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
 from langchain_core.documents import Document
 from ws_bom_robot_app.llm.vector_store.loader.base import Loader
 from typing import Union
@@ -39,7 +40,7 @@ class Slack(IntegrationStrategy):
             start_date=datetime.now() - timedelta(days=self.__data.num_days),
             end_date=datetime.now()
         )
-        downloader_config = SlackDownloaderConfig(
+        downloader_config = DownloaderConfig(
             download_dir=self.working_directory
         )
         connection_config = SlackConnectionConfig(