ws-bom-robot-app 0.0.37__py3-none-any.whl → 0.0.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. ws_bom_robot_app/config.py +35 -7
  2. ws_bom_robot_app/cron_manager.py +15 -14
  3. ws_bom_robot_app/llm/agent_context.py +26 -0
  4. ws_bom_robot_app/llm/agent_description.py +123 -123
  5. ws_bom_robot_app/llm/agent_handler.py +176 -180
  6. ws_bom_robot_app/llm/agent_lcel.py +107 -54
  7. ws_bom_robot_app/llm/api.py +100 -7
  8. ws_bom_robot_app/llm/defaut_prompt.py +15 -15
  9. ws_bom_robot_app/llm/evaluator.py +319 -0
  10. ws_bom_robot_app/llm/feedbacks/__init__.py +0 -0
  11. ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -0
  12. ws_bom_robot_app/llm/main.py +159 -110
  13. ws_bom_robot_app/llm/models/api.py +70 -5
  14. ws_bom_robot_app/llm/models/feedback.py +30 -0
  15. ws_bom_robot_app/llm/nebuly_handler.py +185 -0
  16. ws_bom_robot_app/llm/providers/llm_manager.py +244 -80
  17. ws_bom_robot_app/llm/tools/models/main.py +8 -0
  18. ws_bom_robot_app/llm/tools/tool_builder.py +68 -23
  19. ws_bom_robot_app/llm/tools/tool_manager.py +343 -133
  20. ws_bom_robot_app/llm/tools/utils.py +41 -25
  21. ws_bom_robot_app/llm/utils/agent.py +34 -0
  22. ws_bom_robot_app/llm/utils/chunker.py +6 -1
  23. ws_bom_robot_app/llm/utils/cleanup.py +81 -0
  24. ws_bom_robot_app/llm/utils/cms.py +123 -0
  25. ws_bom_robot_app/llm/utils/download.py +183 -79
  26. ws_bom_robot_app/llm/utils/print.py +29 -29
  27. ws_bom_robot_app/llm/vector_store/db/__init__.py +0 -0
  28. ws_bom_robot_app/llm/vector_store/db/base.py +193 -0
  29. ws_bom_robot_app/llm/vector_store/db/chroma.py +97 -0
  30. ws_bom_robot_app/llm/vector_store/db/faiss.py +91 -0
  31. ws_bom_robot_app/llm/vector_store/db/manager.py +15 -0
  32. ws_bom_robot_app/llm/vector_store/db/qdrant.py +73 -0
  33. ws_bom_robot_app/llm/vector_store/generator.py +137 -137
  34. ws_bom_robot_app/llm/vector_store/integration/api.py +216 -0
  35. ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
  36. ws_bom_robot_app/llm/vector_store/integration/base.py +58 -15
  37. ws_bom_robot_app/llm/vector_store/integration/confluence.py +41 -11
  38. ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  39. ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
  40. ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
  41. ws_bom_robot_app/llm/vector_store/integration/googledrive.py +46 -17
  42. ws_bom_robot_app/llm/vector_store/integration/jira.py +112 -75
  43. ws_bom_robot_app/llm/vector_store/integration/manager.py +6 -2
  44. ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
  45. ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
  46. ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
  47. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
  48. ws_bom_robot_app/llm/vector_store/integration/sitemap.py +9 -1
  49. ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
  50. ws_bom_robot_app/llm/vector_store/integration/thron.py +236 -0
  51. ws_bom_robot_app/llm/vector_store/loader/base.py +52 -8
  52. ws_bom_robot_app/llm/vector_store/loader/docling.py +71 -33
  53. ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
  54. ws_bom_robot_app/main.py +148 -146
  55. ws_bom_robot_app/subprocess_runner.py +106 -0
  56. ws_bom_robot_app/task_manager.py +207 -54
  57. ws_bom_robot_app/util.py +65 -20
  58. ws_bom_robot_app-0.0.103.dist-info/METADATA +364 -0
  59. ws_bom_robot_app-0.0.103.dist-info/RECORD +76 -0
  60. {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/WHEEL +1 -1
  61. ws_bom_robot_app/llm/settings.py +0 -4
  62. ws_bom_robot_app/llm/utils/agent_utils.py +0 -17
  63. ws_bom_robot_app/llm/utils/kb.py +0 -34
  64. ws_bom_robot_app-0.0.37.dist-info/METADATA +0 -277
  65. ws_bom_robot_app-0.0.37.dist-info/RECORD +0 -60
  66. {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/integration/shopify.py
@@ -0,0 +1,143 @@
1
+ import asyncio, logging, aiohttp
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
+ from langchain_core.documents import Document
4
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
5
+ from typing import List, Union, Optional
6
+ from pydantic import BaseModel, Field, AliasChoices, field_validator
7
+ import json
8
+ import os
9
+
10
+ class ShopifyParams(BaseModel):
11
+ """
12
+ ShopifyParams is a model that defines the parameters required for Shopify integration.
13
+
14
+ Attributes:
15
+ shop_name (str): The shop name for Shopify.
16
+ access_token (str): The access token for Shopify.
17
+ graphql_query (Union[str, dict]): The GraphQL query string or dict for Shopify.
18
+ """
19
+ shop_name: str = Field(validation_alias=AliasChoices("shopName","shop_name"))
20
+ access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
21
+ graphql_query: Union[str, dict] = Field(validation_alias=AliasChoices("graphqlQuery","graphql_query"))
22
+
23
+ @field_validator('graphql_query')
24
+ @classmethod
25
+ def extract_query_string(cls, v):
26
+ """Extract the query string from dict format if needed"""
27
+ if isinstance(v, dict) and 'query' in v:
28
+ return v['query']
29
+ return v
30
+
31
+ class Shopify(IntegrationStrategy):
32
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
33
+ super().__init__(knowledgebase_path, data)
34
+ self.__data = ShopifyParams.model_validate(self.data)
35
+
36
+ def working_subdirectory(self) -> str:
37
+ return 'shopify'
38
+
39
+ async def run(self) -> None:
40
+ _data = await self.__get_data()
41
+ json_file_path = os.path.join(self.working_directory, 'shopify_data.json')
42
+ with open(json_file_path, 'w', encoding='utf-8') as f:
43
+ json.dump(_data, f, ensure_ascii=False)
44
+
45
+ async def load(self) -> list[Document]:
46
+ await self.run()
47
+ await asyncio.sleep(1)
48
+ return await Loader(self.working_directory).load()
49
+
50
+ async def __get_data(self, page_size: int = 50) -> List[dict]:
51
+ # API endpoint URL
52
+ url = f"https://{self.__data.shop_name}.myshopify.com/admin/api/2024-07/graphql.json"
53
+
54
+ # Headers
55
+ headers = {
56
+ "X-Shopify-Access-Token": self.__data.access_token,
57
+ "Content-Type": "application/json"
58
+ }
59
+
60
+ all_data: List[dict] = []
61
+ has_next_page = True
62
+ cursor = None
63
+ retry_count = 0
64
+ max_retries = 5
65
+
66
+ while has_next_page:
67
+ # Query variables
68
+ variables = {
69
+ "first": page_size
70
+ }
71
+
72
+ if cursor:
73
+ variables["after"] = cursor
74
+
75
+ # Request payload
76
+ payload = {
77
+ "query": self.__data.graphql_query,
78
+ "variables": variables
79
+ }
80
+
81
+ try:
82
+ # Perform the request
83
+ async with aiohttp.ClientSession() as session:
84
+ async with session.post(url, headers=headers, json=payload) as response:
85
+ # Check whether the response is JSON
86
+ try:
87
+ data = await response.json()
88
+ except aiohttp.ContentTypeError:
89
+ text = await response.text()
90
+ logging.error(f"Non-JSON response received. Status code: {response.status}")
91
+ logging.error(f"Content: {text}")
92
+ raise Exception("Invalid response from API")
93
+
94
+ # Throttling handling
95
+ if "errors" in data:
96
+ error = data["errors"][0]
97
+ if error.get("extensions", {}).get("code") == "THROTTLED":
98
+ retry_count += 1
99
+ if retry_count > max_retries:
100
+ raise Exception("Too many throttling attempts. Stopping execution.")
101
+
102
+ # Wait a bit longer on each retry attempt
103
+ wait_time = 2 ** retry_count # Exponential backoff
104
+ print(f"Rate limit reached. Waiting {wait_time} seconds... (Attempt {retry_count}/{max_retries})")
105
+ await asyncio.sleep(wait_time)
106
+ continue
107
+ else:
108
+ raise Exception(f"GraphQL errors: {data['errors']}")
109
+
110
+ # Reset the retry counter after a successful request
111
+ retry_count = 0
112
+
113
+ # Extract the data
114
+ _data = list(data["data"].values())[0]
115
+ edges = _data["edges"]
116
+ page_info = _data["pageInfo"]
117
+
118
+ # Append the nodes to the result list
119
+ for edge in edges:
120
+ all_data.append(edge["node"])
121
+
122
+ # Update the cursor and the pagination flag
123
+ has_next_page = page_info["hasNextPage"]
124
+ cursor = page_info["endCursor"]
125
+
126
+ print(f"Recuperati {len(edges)} prodotti. Totale: {len(all_data)}")
127
+
128
+ # Short pause to avoid saturating the API
129
+ await asyncio.sleep(0.1)
130
+
131
+ except aiohttp.ClientError as e:
132
+ logging.error(f"Connection error: {e}")
133
+ retry_count += 1
134
+ if retry_count <= max_retries:
135
+ wait_time = 2 ** retry_count
136
+ logging.warning(f"Retrying in {wait_time} seconds...")
137
+ await asyncio.sleep(wait_time)
138
+ continue
139
+ else:
140
+ raise Exception("Too many network errors. Stopping execution.")
141
+
142
+ logging.info(f"Data retrieval completed! Total data: {len(all_data)}")
143
+ return all_data
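For orientation, a minimal usage sketch of the new Shopify strategy (not part of the diff): the shop name, access token, and GraphQL query below are placeholders, and the query must expose a single paginated connection with edges/node and pageInfo, which is the shape the pagination loop above expects.

import asyncio
from ws_bom_robot_app.llm.vector_store.integration.shopify import Shopify

data = {
    "shopName": "my-shop",         # placeholder shop subdomain
    "accessToken": "shpat_xxx",    # placeholder Admin API access token
    "graphqlQuery": """
      query($first: Int!, $after: String) {
        products(first: $first, after: $after) {
          edges { node { id title description } }
          pageInfo { hasNextPage endCursor }
        }
      }""",
}

# run() writes the collected nodes to shopify_data.json in the working
# directory; load() then parses that directory into langchain Documents
docs = asyncio.run(Shopify("./kb", data).load())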
ws_bom_robot_app/llm/vector_store/integration/sitemap.py
@@ -1,3 +1,4 @@
1
+ import sys, asyncio
1
2
  from typing import Any, AsyncGenerator, AsyncIterator
2
3
  import aiofiles
3
4
  import aiofiles.os
@@ -20,6 +21,7 @@ class Sitemap(IntegrationStrategy):
20
21
  data["excludeTag"] (list[str]): default to ["script", "noscript", "style", "head", "header","nav","footer", "iframe"]
21
22
  data["excludeClass"] (list[str]): ["class1", "class2"]
22
23
  data["excludeId"] (list[str]): ["id1", "id2"]
24
+ data["restrictDomain"] (bool): if True, only urls from the same domain will be loaded, default to True
23
25
  """
24
26
  def __init__(self, knowledgebase_path: str, data: dict[str, Any]):
25
27
  super().__init__(knowledgebase_path, data)
@@ -30,6 +32,8 @@ class Sitemap(IntegrationStrategy):
30
32
  self.__exclude_tag: list[str] = self.data.get("excludeTag",[]) # type: ignore
31
33
  self.__exclude_class: list[str] = self.data.get("excludeClass",[]) # type: ignore
32
34
  self.__exclude_id: list[str] = self.data.get("excludeId",[]) # type: ignore
35
+ self.__restrict_to_same_domain: bool = self.data.get("restrictDomain", True) # type: ignore
36
+ self.__header_template = self.data.get("headers", None)
33
37
  def working_subdirectory(self) -> str:
34
38
  return ""
35
39
  def _extract(self, tag: Tag) -> str:
@@ -62,6 +66,8 @@ class Sitemap(IntegrationStrategy):
62
66
  return f"{self.knowledgebase_path}/{url}" if self._is_local(url) else url
63
67
  async def alazy_load(self,loader: SitemapLoader) -> AsyncIterator[Document]:
64
68
  """A lazy loader for Documents."""
69
+ if sys.platform == 'win32':
70
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
65
71
  iterator = await run_in_executor(None, loader.lazy_load)
66
72
  done = object()
67
73
  while True:
@@ -75,7 +81,9 @@ class Sitemap(IntegrationStrategy):
75
81
  web_path=self._remap_if_local(self.__sitemap_url),
76
82
  filter_urls=self.__filter_urls,
77
83
  parsing_function=self._parse,
78
- is_local=self._is_local(self.__sitemap_url)
84
+ is_local=self._is_local(self.__sitemap_url),
85
+ restrict_to_same_domain=self.__restrict_to_same_domain,
86
+ header_template=self.__header_template
79
87
  )
80
88
  _docs = self._output([document async for document in self.alazy_load(_loader)])
81
89
  if self._is_local(self.__sitemap_url):
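For reference, a sketch of the data dict the Sitemap strategy now accepts; the sitemap key name and header values are illustrative, not taken from the diff.

sitemap_data = {
    "sitemapUrl": "https://example.com/sitemap.xml",  # assumption: the key read elsewhere by the strategy
    "excludeTag": ["script", "style"],
    "excludeClass": ["cookie-banner"],
    "excludeId": ["footer"],
    "restrictDomain": True,                           # new: only load URLs from the sitemap's own domain
    "headers": {"User-Agent": "ws-bom-robot"},        # new: forwarded to SitemapLoader as header_template
}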
ws_bom_robot_app/llm/vector_store/integration/slack.py
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
3
- from unstructured_ingest.v2.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
3
+ from unstructured_ingest.interfaces.downloader import DownloaderConfig
4
+ from unstructured_ingest.processes.connectors.slack import SlackIndexerConfig, SlackDownloaderConfig, SlackConnectionConfig, SlackAccessConfig
4
5
  from langchain_core.documents import Document
5
6
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
6
7
  from typing import Union
@@ -39,7 +40,7 @@ class Slack(IntegrationStrategy):
39
40
  start_date=datetime.now() - timedelta(days=self.__data.num_days),
40
41
  end_date=datetime.now()
41
42
  )
42
- downloader_config = SlackDownloaderConfig(
43
+ downloader_config = DownloaderConfig(
43
44
  download_dir=self.working_directory
44
45
  )
45
46
  connection_config = SlackConnectionConfig(
ws_bom_robot_app/llm/vector_store/integration/thron.py
@@ -0,0 +1,236 @@
1
+ import asyncio, logging, aiohttp
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
+ from langchain_core.documents import Document
4
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
5
+ from typing import List, Union, Optional
6
+ from pydantic import BaseModel, Field, AliasChoices
7
+ import json
8
+ import os
9
+ import platform
10
+ import pandas as pd
11
+ from io import BytesIO
12
+
13
+ # Fix for Windows event loop issue with aiodns
14
+ if platform.system() == 'Windows':
15
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
16
+
17
+ class ThronParams(BaseModel):
18
+ """
19
+ ThronParams is a model that defines the parameters required for Thron integration.
20
+
21
+ Attributes:
22
+ organization_name (str): The organization name (Thron subdomain).
+ attribute_fields (Optional[List[str]]): Optional product attribute fields to include in the export.
23
+ client_id (str): The client ID for Thron.
24
+ client_secret (str): The client secret for Thron.
25
+ """
26
+ organization_name: str = Field(validation_alias=AliasChoices("organizationName","organization_name"))
27
+ attribute_fields: Optional[List[str]] = Field(default=None, validation_alias=AliasChoices("attributeFields","attribute_fields"))
28
+ client_id: str = Field(validation_alias=AliasChoices("clientId","client_id"))
29
+ client_secret: str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
30
+
31
+ class Thron(IntegrationStrategy):
32
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
33
+ super().__init__(knowledgebase_path, data)
34
+ self.__token = None
35
+ self.__data = ThronParams.model_validate(self.data)
36
+
37
+ def working_subdirectory(self) -> str:
38
+ return 'thron'
39
+
40
+ async def __ensure_token(self) -> bool:
41
+ """Ensure we have a valid token, getting one if needed."""
42
+ if not self.__token:
43
+ self.__token = await self.__get_auth_token()
44
+ return bool(self.__token)
45
+
46
+ def __convert_xlsx_to_csv(self, file_content: bytes) -> bool:
47
+ """Convert XLSX file content to CSV and save to working directory."""
48
+ try:
49
+ df = pd.read_excel(BytesIO(file_content))
50
+ csv_path = os.path.join(self.working_directory, 'thron_export.csv')
51
+ df.to_csv(csv_path, index=False, encoding='utf-8')
52
+ return True
53
+ except Exception as e:
54
+ logging.error(f"Error converting XLSX to CSV: {e}")
55
+ return False
56
+
57
+ async def run(self) -> None:
58
+ _run_id = await self.__get_data()
59
+ if _run_id:
60
+ await self.__fetch_exported_file(_run_id)
61
+
62
+ async def load(self) -> list[Document]:
63
+ await self.run()
64
+ await asyncio.sleep(1)
65
+ return await Loader(self.working_directory).load()
66
+
67
+ async def __get_auth_token(self) -> Optional[str]:
68
+ """
69
+ Get authentication token from Thron API.
70
+
71
+ Returns:
72
+ str: The access token if successful, None otherwise.
73
+ """
74
+ try:
75
+ async with aiohttp.ClientSession() as session:
76
+ auth_data = {
77
+ "grant_type": "client_credentials",
78
+ "client_id": self.__data.client_id,
79
+ "client_secret": self.__data.client_secret
80
+ }
81
+ headers = {
82
+ "accept": "application/json",
83
+ "Content-Type": "application/x-www-form-urlencoded"
84
+ }
85
+ async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/authentication/oauth2/token", data=auth_data, headers=headers) as response:
86
+ result = await response.json()
87
+ return result.get("access_token", "")
88
+ except Exception as e:
89
+ logging.error(f"Error fetching Thron auth token: {e}")
90
+ return None
91
+
92
+ async def __refresh_token(self) -> bool:
93
+ """Refresh the authentication token and update the instance variable."""
94
+ try:
95
+ new_token = await self.__get_auth_token()
96
+ if new_token:
97
+ self.__token = new_token
98
+ logging.info("Thron authentication token refreshed successfully.")
99
+ return True
100
+ else:
101
+ logging.error("Failed to refresh Thron authentication token.")
102
+ return False
103
+ except Exception as e:
104
+ logging.error(f"Error refreshing Thron auth token: {e}")
105
+ return False
106
+
107
+ async def __get_data(self) -> Optional[str]:
108
+ """
109
+ Initiates a data export request to Thron API.
110
+
111
+ Returns:
112
+ str: The export ID if successful, None otherwise.
113
+ """
114
+ max_retries = 2
115
+ retry_count = 0
116
+
117
+ while retry_count < max_retries:
118
+ try:
119
+ if not await self.__ensure_token():
120
+ logging.error("Failed to obtain Thron authentication token.")
121
+ return None
122
+
123
+ async with aiohttp.ClientSession() as session:
124
+ headers = {
125
+ "accept": "application/json",
126
+ "Authorization": f"Bearer {self.__token}"
127
+ }
128
+ payload = {"attributes": self.__data.attribute_fields or [],"assetsBy":"CODE","type":"CODES","format":"XLSX","locales":[],"systemAttributes":["family","master","variation","variationGroup","hierarchyLevel"]}
129
+ async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/product-sync/exports", headers=headers, json=payload) as response:
130
+ # Check for authentication errors
131
+ if response.status == 401:
132
+ logging.warning("Authentication failed in __get_data, attempting to refresh token...")
133
+ if await self.__refresh_token():
134
+ retry_count += 1
135
+ continue
136
+ else:
137
+ logging.error("Token refresh failed in __get_data.")
138
+ return None
139
+
140
+ if response.status not in range(200, 300):
141
+ logging.error(f"API request failed with status {response.status}")
142
+ return None
143
+
144
+ result = await response.json()
145
+ return result.get("id", None)
146
+
147
+ except Exception as e:
148
+ logging.error(f"Error fetching Thron product data (attempt {retry_count + 1}): {e}")
149
+ if retry_count < max_retries - 1:
150
+ if await self.__refresh_token():
151
+ retry_count += 1
152
+ continue
153
+ retry_count += 1
154
+
155
+ logging.error(f"Failed to fetch Thron product data after {max_retries} attempts.")
156
+ return None
157
+
158
+
159
+ async def __fetch_exported_file(self, export_id: str) -> bool:
160
+ """
161
+ Fetches the exported file from Thron API using the provided export ID.
162
+ Polls the export status until it's processed, then downloads the XLSX file
163
+ and converts it to CSV format in the working directory.
164
+
165
+ Args:
166
+ export_id (str): The ID of the export to fetch.
167
+
168
+ Returns:
169
+ bool: True if file was successfully downloaded and converted, False otherwise.
170
+ """
171
+ max_retries = 2
172
+ retry_count = 0
173
+
174
+ while retry_count < max_retries:
175
+ try:
176
+ # Ensure we have a token
177
+ if not await self.__ensure_token():
178
+ logging.error("Failed to obtain Thron authentication token.")
179
+ return False
180
+
181
+ async with aiohttp.ClientSession() as session:
182
+ headers = {
183
+ "accept": "application/json",
184
+ "Authorization": f"Bearer {self.__token}"
185
+ }
186
+
187
+ # Polling until status is PROCESSED
188
+ while True:
189
+ async with session.get(f"https://{self.__data.organization_name}.thron.com/api/v1/product-sync/exports/{export_id}", headers=headers) as response:
190
+ # Check for authentication errors
191
+ if response.status == 401:
192
+ logging.warning("Authentication failed, attempting to refresh token...")
193
+ if await self.__refresh_token():
194
+ headers["Authorization"] = f"Bearer {self.__token}"
195
+ continue
196
+ else:
197
+ logging.error("Token refresh failed, aborting request.")
198
+ return False
199
+
200
+ if response.status != 200:
201
+ logging.error(f"API request failed with status {response.status}")
202
+ break
203
+
204
+ result = await response.json()
205
+ if result.get("status") == "PROCESSED":
206
+ download_uri = result.get("downloadUri")
207
+ if download_uri:
208
+ async with session.get(download_uri) as file_response:
209
+ if file_response.status == 200:
210
+ # Download XLSX file
211
+ file_content = await file_response.read()
212
+ return self.__convert_xlsx_to_csv(file_content)
213
+
214
+ elif file_response.status == 401:
215
+ logging.warning("Authentication failed during file download, attempting to refresh token...")
216
+ if await self.__refresh_token():
217
+ retry_count += 1
218
+ break
219
+ else:
220
+ logging.error("Token refresh failed during file download.")
221
+ return False
222
+ break
223
+
224
+ await asyncio.sleep(5)
225
+ return False
226
+
227
+ except Exception as e:
228
+ logging.error(f"Error fetching exported data (attempt {retry_count + 1}): {e}")
229
+ if retry_count < max_retries - 1:
230
+ if await self.__refresh_token():
231
+ retry_count += 1
232
+ continue
233
+ retry_count += 1
234
+
235
+ logging.error(f"Failed to fetch exported data after {max_retries} attempts.")
236
+ return False
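A hedged usage sketch for the Thron strategy as well; the organization name, credentials, and attribute names are placeholders. load() triggers the export, polls it until PROCESSED, downloads the XLSX, converts it to CSV, and parses the result into Documents.

import asyncio
from ws_bom_robot_app.llm.vector_store.integration.thron import Thron

data = {
    "organizationName": "acme",                   # placeholder <org>.thron.com subdomain
    "clientId": "xxx",
    "clientSecret": "yyy",
    "attributeFields": ["name", "description"],   # optional product attributes to export
}

docs = asyncio.run(Thron("./kb", data).load())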
ws_bom_robot_app/llm/vector_store/loader/base.py
@@ -15,6 +15,8 @@ from langchain_community.document_loaders import (
15
15
  UnstructuredImageLoader,
16
16
  UnstructuredWordDocumentLoader,
17
17
  UnstructuredXMLLoader,
18
+ UnstructuredExcelLoader,
19
+ UnstructuredPDFLoader,
18
20
  UnstructuredPowerPointLoader,
19
21
  TextLoader
20
22
  )
@@ -30,9 +32,9 @@ class Loader():
30
32
 
31
33
  _list: dict[str, LoaderConfig | None] = {
32
34
  '.json': LoaderConfig(loader=JsonLoader),
33
- '.csv': LoaderConfig(loader=CSVLoader),
35
+ '.csv': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":CSVLoader}),
34
36
  '.xls': None,
35
- '.xlsx': LoaderConfig(loader=DoclingLoader),
37
+ '.xlsx': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredExcelLoader, "strategy":"auto"}),
36
38
  '.eml': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
37
39
  '.msg': LoaderConfig(loader=UnstructuredEmailLoader,kwargs={"strategy":"auto", "process_attachments": False}),
38
40
  '.epub': None,
@@ -47,9 +49,9 @@ class Loader():
47
49
  '.tsv': None,
48
50
  '.text': None,
49
51
  '.log': None,
50
- '.htm': LoaderConfig(loader=BSHTMLLoader),
51
- '.html': LoaderConfig(loader=BSHTMLLoader),
52
- ".pdf": LoaderConfig(loader=DoclingLoader),
52
+ '.htm': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":BSHTMLLoader}),
53
+ '.html': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":BSHTMLLoader}),
54
+ ".pdf": LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredPDFLoader, "strategy":"auto"}),
53
55
  '.png': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
54
56
  '.jpg': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
55
57
  '.jpeg': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredImageLoader, "strategy":"auto"}),
@@ -59,7 +61,7 @@ class Loader():
59
61
  '.tiff': None,
60
62
  '.doc': None, #see libreoffice dependency
61
63
  '.docx': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredWordDocumentLoader, "strategy":"auto"}),
62
- '.xml': LoaderConfig(loader=UnstructuredXMLLoader,kwargs={"strategy":"auto"}),
64
+ '.xml': LoaderConfig(loader=DoclingLoader, kwargs={"fallback":UnstructuredXMLLoader, "strategy":"auto"}),
63
65
  '.js': None,
64
66
  '.py': None,
65
67
  '.c': None,
@@ -102,15 +104,54 @@ class Loader():
102
104
  loader_kwargs=loader_config["loader_kwargs"],
103
105
  show_progress=self._runtime_options.loader_show_progress,
104
106
  recursive=True,
105
- silent_errors=self._runtime_options.loader_silent_errors,
107
+ silent_errors=True, #self._runtime_options.loader_silent_errors,
106
108
  use_multithreading=config.robot_loader_max_threads>1,
107
- max_concurrency=config.robot_loader_max_threads
109
+ max_concurrency=config.robot_loader_max_threads,
110
+ #sample_size=200
108
111
  )
109
112
  )
110
113
  return loaders
111
114
 
112
115
  #@timer
113
116
  async def load(self) -> list[Document]:
117
+ #region log
118
+ import warnings
119
+ warnings.filterwarnings("ignore", message=".*pin_memory.*no accelerator is found.*")
120
+ warnings.filterwarnings("ignore", category=UserWarning)
121
+ log_msg_to_ignore = [
122
+ "Going to convert document batch...",
123
+ "Initializing pipeline for",
124
+ "Accelerator device:",
125
+ "detected formats:",
126
+ "The text detection result is empty",
127
+ "RapidOCR returned empty result!",
128
+ ]
129
+ class MessageFilter(logging.Filter):
130
+ def __init__(self, patterns):
131
+ super().__init__()
132
+ self.log_msg_to_ignore = patterns
133
+
134
+ def filter(self, record):
135
+ for pattern in self.log_msg_to_ignore:
136
+ if pattern in record.getMessage():
137
+ return False
138
+ return True
139
+ message_filter = MessageFilter(log_msg_to_ignore)
140
+ loggers_to_filter = [
141
+ 'docling',
142
+ 'docling.document_converter',
143
+ 'docling.datamodel',
144
+ 'docling.datamodel.document',
145
+ 'docling.models',
146
+ 'docling.models.rapidocr_model',
147
+ 'docling.utils.accelerator_utils',
148
+ 'unstructured',
149
+ 'RapidOCR'
150
+ ]
151
+ for logger_name in loggers_to_filter:
152
+ logging.getLogger(logger_name).addFilter(message_filter)
153
+ #endregion log
154
+
114
155
  MAX_RETRIES = 3
115
156
  loaders: MergedDataLoader = MergedDataLoader(self.__directory_loader())
116
157
  try:
@@ -130,5 +171,8 @@ class Loader():
130
171
  finally:
131
172
  del _documents
132
173
  finally:
174
+ # Remove logging filters
175
+ for logger_name in loggers_to_filter:
176
+ logging.getLogger(logger_name).removeFilter(message_filter)
133
177
  del loaders
134
178
  gc.collect()
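The noise suppression added to load() is a plain logging.Filter attached to, and later removed from, the docling/unstructured loggers. A standalone sketch of the same pattern, with an illustrative logger name:

import logging

class MessageFilter(logging.Filter):
    """Drop records whose message contains any of the given substrings."""
    def __init__(self, patterns):
        super().__init__()
        self.patterns = patterns
    def filter(self, record):
        return not any(p in record.getMessage() for p in self.patterns)

logging.basicConfig(level=logging.INFO)
noisy = logging.getLogger("docling.models")
noise_filter = MessageFilter(["Accelerator device:"])
noisy.addFilter(noise_filter)
noisy.info("Accelerator device: cpu")   # suppressed by the filter
noisy.info("converted 3 pages")         # passes through
noisy.removeFilter(noise_filter)        # restored afterwards, as load() does in its finally block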
ws_bom_robot_app/llm/vector_store/loader/docling.py
@@ -4,23 +4,52 @@ from langchain_core.document_loaders import BaseLoader
4
4
  from langchain_core.documents import Document
5
5
  from langchain_core.runnables import run_in_executor
6
6
  from docling.document_converter import DocumentConverter, InputFormat, PdfFormatOption, ImageFormatOption
7
- from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode, TesseractCliOcrOptions
7
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
8
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode
8
9
  from langchain_community.document_loaders import UnstructuredFileLoader
10
+ from ws_bom_robot_app.llm.vector_store.db.base import VectorDBStrategy
11
+ from docling.datamodel.pipeline_options import RapidOcrOptions
13
+
14
+ def _doclingConverter() -> DocumentConverter:
15
+ _pipeline_config = {
16
+ "accelerator_options": AcceleratorOptions(
17
+ device=AcceleratorDevice.AUTO,
18
+ cuda_use_flash_attention2=False,
19
+ ),
20
+ "table_structure_options": TableStructureOptions(mode=TableFormerMode.ACCURATE),
21
+ }
22
+ _base_pipeline_options = PdfPipelineOptions(
23
+ **_pipeline_config,
24
+ do_ocr=False)
25
+ _ocr_pipeline_options = PdfPipelineOptions(
26
+ **_pipeline_config,
27
+ ocr_options=RapidOcrOptions(
28
+ print_verbose=False,
29
+ text_score=0.5,
30
+ #rapidocr_params={"det_use_cuda": True}
31
+ ))
32
+ doc_converter = DocumentConverter(
33
+ format_options={
34
+ InputFormat.PDF: PdfFormatOption(
35
+ pipeline_options=_base_pipeline_options,
36
+ ),
37
+ InputFormat.IMAGE: ImageFormatOption(
38
+ pipeline_options=_ocr_pipeline_options,
39
+ ),
40
+ }
41
+ )
42
+ for frm in [InputFormat.PDF, InputFormat.IMAGE]:
43
+ doc_converter.initialize_pipeline(frm)
44
+ return doc_converter
9
45
 
10
46
  class DoclingLoader(BaseLoader):
47
+ _doc_converter: Optional[DocumentConverter] = None
11
48
  def __init__(self, file_path: str | list[str], **kwargs: Any) -> None:
12
49
  self._file_paths = file_path if isinstance(file_path, list) else [file_path]
13
- self._converter = DocumentConverter(format_options={
14
- InputFormat.PDF: PdfFormatOption(
15
- pipeline_options=PdfPipelineOptions(
16
- table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE)
17
- )),
18
- InputFormat.IMAGE: ImageFormatOption(
19
- pipeline_options=PdfPipelineOptions(
20
- ocr_options=TesseractCliOcrOptions(lang=["auto"]),
21
- table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE)
22
- ))
23
- })
50
+ if DoclingLoader._doc_converter is None:
51
+ DoclingLoader._doc_converter = _doclingConverter()
52
+ self._converter = DoclingLoader._doc_converter
24
53
  self._kwargs = kwargs
25
54
  def load(self) -> list[Document]:
26
55
  """Load data into Document objects."""
@@ -37,28 +66,37 @@ class DoclingLoader(BaseLoader):
37
66
  if doc is done:
38
67
  break
39
68
  yield doc # type: ignore[misc]
69
+ def _fallback_loader(self, source: str, error: Exception = None) -> Iterator[Document]:
70
+ if 'fallback' in self._kwargs:
71
+ if issubclass(self._kwargs['fallback'], (BaseLoader, UnstructuredFileLoader)):
72
+ logging.info(f"Using fallback loader {self._kwargs['fallback']} for {source}")
73
+ try:
74
+ loader: Union[BaseLoader, UnstructuredFileLoader] = self._kwargs['fallback'](
75
+ source,
76
+ **{k: v for k, v in self._kwargs.items() if k != 'fallback'}
77
+ )
78
+ yield from loader.lazy_load()
79
+ except Exception as e:
80
+ logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
81
+ else:
82
+ logging.warning(f"Invalid fallback loader {self._kwargs['fallback']}[{type(self._kwargs['fallback'])}] for {source}")
83
+ else:
84
+ logging.warning(f"Failed to load document from {source}: {error}")
40
85
  def lazy_load(self) -> Iterator[Document]:
41
86
  for source in self._file_paths:
42
87
  try:
43
- _result = self._converter.convert(
44
- os.path.abspath(source),
45
- raises_on_error=True)
46
- doc = _result.document
47
- text = doc.export_to_markdown(image_placeholder="")
48
- yield Document(page_content=text, metadata={"source": source})
49
- except Exception as e:
50
- if 'fallback' in self._kwargs:
51
- if issubclass(self._kwargs['fallback'], (BaseLoader, UnstructuredFileLoader)):
52
- logging.info(f"Using fallback loader {self._kwargs['fallback']} for {source}")
53
- try:
54
- loader: Union[BaseLoader, UnstructuredFileLoader] = self._kwargs['fallback'](
55
- source,
56
- **{k: v for k, v in self._kwargs.items() if k != 'fallback'}
57
- )
58
- yield from loader.lazy_load()
59
- except Exception as e:
60
- logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
61
- else:
62
- logging.warning(f"Invalid fallback loader {self._kwargs['fallback']}[{type(self._kwargs['fallback'])}] for {source}")
88
+ # handle only small header-based files (.csv/.xlsx) with Docling; route larger ones to the fallback loader to avoid header stripping and improper chunking
89
+ if (source.endswith('.csv') or source.endswith('.xlsx')) \
90
+ and 'fallback' in self._kwargs \
91
+ and os.path.getsize(source) > (VectorDBStrategy.MAX_TOKENS_PER_BATCH // 4): #rough token estimate
92
+ yield from self._fallback_loader(source)
63
93
  else:
64
- logging.warning(f"Failed to load document from {source}: {e} | {traceback.format_exc()}")
94
+ _result = self._converter.convert(
95
+ os.path.abspath(source),
96
+ raises_on_error=True)
97
+ doc = _result.document
98
+ text = doc.export_to_markdown(image_placeholder="")
99
+ yield Document(page_content=text, metadata={"source": source})
100
+ except Exception as e:
101
+ yield from self._fallback_loader(source,e)
102
+
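To illustrate the reworked flow: the shared DocumentConverter is now built once per process and reused by every DoclingLoader instance, oversized .csv/.xlsx files are routed straight to the fallback loader, and any conversion error also falls back. A small sketch with a placeholder file path, mirroring the fallback wiring used in loader/base.py:

from langchain_community.document_loaders import UnstructuredPDFLoader
from ws_bom_robot_app.llm.vector_store.loader.docling import DoclingLoader

loader = DoclingLoader(
    "./kb/report.pdf",               # placeholder path
    fallback=UnstructuredPDFLoader,  # used if Docling conversion raises
    strategy="auto",                 # remaining kwargs are forwarded to the fallback loader
)
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))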