ws-bom-robot-app 0.0.99__py3-none-any.whl → 0.0.101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. ws_bom_robot_app/llm/agent_description.py +123 -123
  2. ws_bom_robot_app/llm/agent_handler.py +176 -176
  3. ws_bom_robot_app/llm/agent_lcel.py +111 -50
  4. ws_bom_robot_app/llm/defaut_prompt.py +15 -15
  5. ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -66
  6. ws_bom_robot_app/llm/main.py +159 -158
  7. ws_bom_robot_app/llm/models/api.py +2 -1
  8. ws_bom_robot_app/llm/models/feedback.py +30 -30
  9. ws_bom_robot_app/llm/nebuly_handler.py +185 -185
  10. ws_bom_robot_app/llm/tools/tool_builder.py +68 -68
  11. ws_bom_robot_app/llm/tools/tool_manager.py +332 -332
  12. ws_bom_robot_app/llm/tools/utils.py +41 -41
  13. ws_bom_robot_app/llm/utils/agent.py +34 -34
  14. ws_bom_robot_app/llm/utils/cms.py +114 -114
  15. ws_bom_robot_app/llm/utils/download.py +183 -183
  16. ws_bom_robot_app/llm/utils/print.py +29 -29
  17. ws_bom_robot_app/llm/vector_store/generator.py +137 -137
  18. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -143
  19. ws_bom_robot_app/llm/vector_store/integration/thron.py +236 -236
  20. ws_bom_robot_app/llm/vector_store/loader/base.py +7 -1
  21. ws_bom_robot_app/llm/vector_store/loader/docling.py +20 -12
  22. ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
  23. {ws_bom_robot_app-0.0.99.dist-info → ws_bom_robot_app-0.0.101.dist-info}/METADATA +364 -364
  24. {ws_bom_robot_app-0.0.99.dist-info → ws_bom_robot_app-0.0.101.dist-info}/RECORD +26 -26
  25. {ws_bom_robot_app-0.0.99.dist-info → ws_bom_robot_app-0.0.101.dist-info}/WHEEL +0 -0
  26. {ws_bom_robot_app-0.0.99.dist-info → ws_bom_robot_app-0.0.101.dist-info}/top_level.txt +0 -0
@@ -1,236 +1,236 @@
1
- import asyncio, logging, aiohttp
2
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
- from langchain_core.documents import Document
4
- from ws_bom_robot_app.llm.vector_store.loader.base import Loader
5
- from typing import List, Union, Optional
6
- from pydantic import BaseModel, Field, AliasChoices
7
- import json
8
- import os
9
- import platform
10
- import pandas as pd
11
- from io import BytesIO
12
-
13
- # Fix for Windows event loop issue with aiodns
14
- if platform.system() == 'Windows':
15
- asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
16
-
17
- class ThronParams(BaseModel):
18
- """
19
- ThronParams is a model that defines the parameters required for Thron integration.
20
-
21
- Attributes:
22
- app_id (str): The application ID for Thron.
23
- client_id (str): The client ID for Thron.
24
- client_secret (str): The client secret for Thron.
25
- """
26
- organization_name: str = Field(validation_alias=AliasChoices("organizationName","organization_name"))
27
- attribute_fields: Optional[List[str]] = Field(default=None, validation_alias=AliasChoices("attributeFields","attribute_fields"))
28
- client_id: str = Field(validation_alias=AliasChoices("clientId","client_id"))
29
- client_secret: str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
30
-
31
- class Thron(IntegrationStrategy):
32
- def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
33
- super().__init__(knowledgebase_path, data)
34
- self.__token = None
35
- self.__data = ThronParams.model_validate(self.data)
36
-
37
- def working_subdirectory(self) -> str:
38
- return 'thron'
39
-
40
- async def __ensure_token(self) -> bool:
41
- """Ensure we have a valid token, getting one if needed."""
42
- if not self.__token:
43
- self.__token = await self.__get_auth_token()
44
- return self.__token is not None
45
-
46
- def __convert_xlsx_to_csv(self, file_content: bytes) -> bool:
47
- """Convert XLSX file content to CSV and save to working directory."""
48
- try:
49
- df = pd.read_excel(BytesIO(file_content))
50
- csv_path = os.path.join(self.working_directory, 'thron_export.csv')
51
- df.to_csv(csv_path, index=False, encoding='utf-8')
52
- return True
53
- except Exception as e:
54
- logging.error(f"Error converting XLSX to CSV: {e}")
55
- return False
56
-
57
- async def run(self) -> None:
58
- _run_id = await self.__get_data()
59
- if _run_id:
60
- await self.__fetch_exported_file(_run_id)
61
-
62
- async def load(self) -> list[Document]:
63
- await self.run()
64
- await asyncio.sleep(1)
65
- return await Loader(self.working_directory).load()
66
-
67
- async def __get_auth_token(self) -> str:
68
- """
69
- Get authentication token from Thron API.
70
-
71
- Returns:
72
- str: The access token if successful, None otherwise.
73
- """
74
- try:
75
- async with aiohttp.ClientSession() as session:
76
- auth_data = {
77
- "grant_type": "client_credentials",
78
- "client_id": self.__data.client_id,
79
- "client_secret": self.__data.client_secret
80
- }
81
- headers = {
82
- "accept": "application/json",
83
- "Content-Type": "application/x-www-form-urlencoded"
84
- }
85
- async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/authentication/oauth2/token", data=auth_data, headers=headers) as response:
86
- result = await response.json()
87
- return result.get("access_token", "")
88
- except Exception as e:
89
- logging.error(f"Error fetching Thron auth token: {e}")
90
- return None
91
-
92
- async def __refresh_token(self) -> bool:
93
- """Refresh the authentication token and update the instance variable."""
94
- try:
95
- new_token = await self.__get_auth_token()
96
- if new_token:
97
- self.__token = new_token
98
- logging.info("Thron authentication token refreshed successfully.")
99
- return True
100
- else:
101
- logging.error("Failed to refresh Thron authentication token.")
102
- return False
103
- except Exception as e:
104
- logging.error(f"Error refreshing Thron auth token: {e}")
105
- return False
106
-
107
- async def __get_data(self) -> str:
108
- """
109
- Initiates a data export request to Thron API.
110
-
111
- Returns:
112
- str: The export ID if successful, None otherwise.
113
- """
114
- max_retries = 2
115
- retry_count = 0
116
-
117
- while retry_count < max_retries:
118
- try:
119
- if not await self.__ensure_token():
120
- logging.error("Failed to obtain Thron authentication token.")
121
- return {}
122
-
123
- async with aiohttp.ClientSession() as session:
124
- headers = {
125
- "accept": "application/json",
126
- "Authorization": f"Bearer {self.__token}"
127
- }
128
- payload = {"attributes": self.__data.attribute_fields or [],"assetsBy":"CODE","type":"CODES","format":"XLSX","locales":[],"systemAttributes":["family","master","variation","variationGroup","hierarchyLevel"]}
129
- async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/product-sync/exports", headers=headers, json=payload) as response:
130
- # Check for authentication errors
131
- if response.status == 401:
132
- logging.warning("Authentication failed in __get_data, attempting to refresh token...")
133
- if await self.__refresh_token():
134
- retry_count += 1
135
- continue
136
- else:
137
- logging.error("Token refresh failed in __get_data.")
138
- return None
139
-
140
- if response.status not in range(200, 300):
141
- logging.error(f"API request failed with status {response.status}")
142
- return None
143
-
144
- result = await response.json()
145
- return result.get("id", None)
146
-
147
- except Exception as e:
148
- logging.error(f"Error fetching Thron product data (attempt {retry_count + 1}): {e}")
149
- if retry_count < max_retries - 1:
150
- if await self.__refresh_token():
151
- retry_count += 1
152
- continue
153
- retry_count += 1
154
-
155
- logging.error(f"Failed to fetch Thron product data after {max_retries} attempts.")
156
- return {}
157
-
158
-
159
- async def __fetch_exported_file(self, export_id: str) -> bool:
160
- """
161
- Fetches the exported file from Thron API using the provided export ID.
162
- Polls the export status until it's processed, then downloads the XLSX file
163
- and converts it to CSV format in the working directory.
164
-
165
- Args:
166
- export_id (str): The ID of the export to fetch.
167
-
168
- Returns:
169
- bool: True if file was successfully downloaded and converted, False otherwise.
170
- """
171
- max_retries = 2
172
- retry_count = 0
173
-
174
- while retry_count < max_retries:
175
- try:
176
- # Ensure we have a token
177
- if not await self.__ensure_token():
178
- logging.error("Failed to obtain Thron authentication token.")
179
- return {}
180
-
181
- async with aiohttp.ClientSession() as session:
182
- headers = {
183
- "accept": "application/json",
184
- "Authorization": f"Bearer {self.__token}"
185
- }
186
-
187
- # Polling until status is PROCESSED
188
- while True:
189
- async with session.get(f"https://{self.__data.organization_name}.thron.com/api/v1/product-sync/exports/{export_id}", headers=headers) as response:
190
- # Check for authentication errors
191
- if response.status == 401:
192
- logging.warning("Authentication failed, attempting to refresh token...")
193
- if await self.__refresh_token():
194
- headers["Authorization"] = f"Bearer {self.__token}"
195
- continue
196
- else:
197
- logging.error("Token refresh failed, aborting request.")
198
- return {}
199
-
200
- if response.status != 200:
201
- logging.error(f"API request failed with status {response.status}")
202
- break
203
-
204
- result = await response.json()
205
- if result.get("status") == "PROCESSED":
206
- download_uri = result.get("downloadUri")
207
- if download_uri:
208
- async with session.get(download_uri) as file_response:
209
- if file_response.status == 200:
210
- # Download XLSX file
211
- file_content = await file_response.read()
212
- return self.__convert_xlsx_to_csv(file_content)
213
-
214
- elif file_response.status == 401:
215
- logging.warning("Authentication failed during file download, attempting to refresh token...")
216
- if await self.__refresh_token():
217
- retry_count += 1
218
- break
219
- else:
220
- logging.error("Token refresh failed during file download.")
221
- return False
222
- break
223
-
224
- await asyncio.sleep(5)
225
- return False
226
-
227
- except Exception as e:
228
- logging.error(f"Error fetching exported data (attempt {retry_count + 1}): {e}")
229
- if retry_count < max_retries - 1:
230
- if await self.__refresh_token():
231
- retry_count += 1
232
- continue
233
- retry_count += 1
234
-
235
- logging.error(f"Failed to fetch exported data after {max_retries} attempts.")
236
- return False
1
+ import asyncio, logging, aiohttp
2
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
3
+ from langchain_core.documents import Document
4
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
5
+ from typing import List, Union, Optional
6
+ from pydantic import BaseModel, Field, AliasChoices
7
+ import json
8
+ import os
9
+ import platform
10
+ import pandas as pd
11
+ from io import BytesIO
12
+
13
+ # Fix for Windows event loop issue with aiodns
14
+ if platform.system() == 'Windows':
15
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
16
+
17
+ class ThronParams(BaseModel):
18
+ """
19
+ ThronParams is a model that defines the parameters required for Thron integration.
20
+
21
+ Attributes:
22
+ app_id (str): The application ID for Thron.
23
+ client_id (str): The client ID for Thron.
24
+ client_secret (str): The client secret for Thron.
25
+ """
26
+ organization_name: str = Field(validation_alias=AliasChoices("organizationName","organization_name"))
27
+ attribute_fields: Optional[List[str]] = Field(default=None, validation_alias=AliasChoices("attributeFields","attribute_fields"))
28
+ client_id: str = Field(validation_alias=AliasChoices("clientId","client_id"))
29
+ client_secret: str = Field(validation_alias=AliasChoices("clientSecret","client_secret"))
30
+
31
+ class Thron(IntegrationStrategy):
32
+ def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
33
+ super().__init__(knowledgebase_path, data)
34
+ self.__token = None
35
+ self.__data = ThronParams.model_validate(self.data)
36
+
37
+ def working_subdirectory(self) -> str:
38
+ return 'thron'
39
+
40
+ async def __ensure_token(self) -> bool:
41
+ """Ensure we have a valid token, getting one if needed."""
42
+ if not self.__token:
43
+ self.__token = await self.__get_auth_token()
44
+ return self.__token is not None
45
+
46
+ def __convert_xlsx_to_csv(self, file_content: bytes) -> bool:
47
+ """Convert XLSX file content to CSV and save to working directory."""
48
+ try:
49
+ df = pd.read_excel(BytesIO(file_content))
50
+ csv_path = os.path.join(self.working_directory, 'thron_export.csv')
51
+ df.to_csv(csv_path, index=False, encoding='utf-8')
52
+ return True
53
+ except Exception as e:
54
+ logging.error(f"Error converting XLSX to CSV: {e}")
55
+ return False
56
+
57
+ async def run(self) -> None:
58
+ _run_id = await self.__get_data()
59
+ if _run_id:
60
+ await self.__fetch_exported_file(_run_id)
61
+
62
+ async def load(self) -> list[Document]:
63
+ await self.run()
64
+ await asyncio.sleep(1)
65
+ return await Loader(self.working_directory).load()
66
+
67
+ async def __get_auth_token(self) -> str:
68
+ """
69
+ Get authentication token from Thron API.
70
+
71
+ Returns:
72
+ str: The access token if successful, None otherwise.
73
+ """
74
+ try:
75
+ async with aiohttp.ClientSession() as session:
76
+ auth_data = {
77
+ "grant_type": "client_credentials",
78
+ "client_id": self.__data.client_id,
79
+ "client_secret": self.__data.client_secret
80
+ }
81
+ headers = {
82
+ "accept": "application/json",
83
+ "Content-Type": "application/x-www-form-urlencoded"
84
+ }
85
+ async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/authentication/oauth2/token", data=auth_data, headers=headers) as response:
86
+ result = await response.json()
87
+ return result.get("access_token", "")
88
+ except Exception as e:
89
+ logging.error(f"Error fetching Thron auth token: {e}")
90
+ return None
91
+
92
+ async def __refresh_token(self) -> bool:
93
+ """Refresh the authentication token and update the instance variable."""
94
+ try:
95
+ new_token = await self.__get_auth_token()
96
+ if new_token:
97
+ self.__token = new_token
98
+ logging.info("Thron authentication token refreshed successfully.")
99
+ return True
100
+ else:
101
+ logging.error("Failed to refresh Thron authentication token.")
102
+ return False
103
+ except Exception as e:
104
+ logging.error(f"Error refreshing Thron auth token: {e}")
105
+ return False
106
+
107
+ async def __get_data(self) -> str:
108
+ """
109
+ Initiates a data export request to Thron API.
110
+
111
+ Returns:
112
+ str: The export ID if successful, None otherwise.
113
+ """
114
+ max_retries = 2
115
+ retry_count = 0
116
+
117
+ while retry_count < max_retries:
118
+ try:
119
+ if not await self.__ensure_token():
120
+ logging.error("Failed to obtain Thron authentication token.")
121
+ return {}
122
+
123
+ async with aiohttp.ClientSession() as session:
124
+ headers = {
125
+ "accept": "application/json",
126
+ "Authorization": f"Bearer {self.__token}"
127
+ }
128
+ payload = {"attributes": self.__data.attribute_fields or [],"assetsBy":"CODE","type":"CODES","format":"XLSX","locales":[],"systemAttributes":["family","master","variation","variationGroup","hierarchyLevel"]}
129
+ async with session.post(f"https://{self.__data.organization_name}.thron.com/api/v1/product-sync/exports", headers=headers, json=payload) as response:
130
+ # Check for authentication errors
131
+ if response.status == 401:
132
+ logging.warning("Authentication failed in __get_data, attempting to refresh token...")
133
+ if await self.__refresh_token():
134
+ retry_count += 1
135
+ continue
136
+ else:
137
+ logging.error("Token refresh failed in __get_data.")
138
+ return None
139
+
140
+ if response.status not in range(200, 300):
141
+ logging.error(f"API request failed with status {response.status}")
142
+ return None
143
+
144
+ result = await response.json()
145
+ return result.get("id", None)
146
+
147
+ except Exception as e:
148
+ logging.error(f"Error fetching Thron product data (attempt {retry_count + 1}): {e}")
149
+ if retry_count < max_retries - 1:
150
+ if await self.__refresh_token():
151
+ retry_count += 1
152
+ continue
153
+ retry_count += 1
154
+
155
+ logging.error(f"Failed to fetch Thron product data after {max_retries} attempts.")
156
+ return {}
157
+
158
+
159
+ async def __fetch_exported_file(self, export_id: str) -> bool:
160
+ """
161
+ Fetches the exported file from Thron API using the provided export ID.
162
+ Polls the export status until it's processed, then downloads the XLSX file
163
+ and converts it to CSV format in the working directory.
164
+
165
+ Args:
166
+ export_id (str): The ID of the export to fetch.
167
+
168
+ Returns:
169
+ bool: True if file was successfully downloaded and converted, False otherwise.
170
+ """
171
+ max_retries = 2
172
+ retry_count = 0
173
+
174
+ while retry_count < max_retries:
175
+ try:
176
+ # Ensure we have a token
177
+ if not await self.__ensure_token():
178
+ logging.error("Failed to obtain Thron authentication token.")
179
+ return {}
180
+
181
+ async with aiohttp.ClientSession() as session:
182
+ headers = {
183
+ "accept": "application/json",
184
+ "Authorization": f"Bearer {self.__token}"
185
+ }
186
+
187
+ # Polling until status is PROCESSED
188
+ while True:
189
+ async with session.get(f"https://{self.__data.organization_name}.thron.com/api/v1/product-sync/exports/{export_id}", headers=headers) as response:
190
+ # Check for authentication errors
191
+ if response.status == 401:
192
+ logging.warning("Authentication failed, attempting to refresh token...")
193
+ if await self.__refresh_token():
194
+ headers["Authorization"] = f"Bearer {self.__token}"
195
+ continue
196
+ else:
197
+ logging.error("Token refresh failed, aborting request.")
198
+ return {}
199
+
200
+ if response.status != 200:
201
+ logging.error(f"API request failed with status {response.status}")
202
+ break
203
+
204
+ result = await response.json()
205
+ if result.get("status") == "PROCESSED":
206
+ download_uri = result.get("downloadUri")
207
+ if download_uri:
208
+ async with session.get(download_uri) as file_response:
209
+ if file_response.status == 200:
210
+ # Download XLSX file
211
+ file_content = await file_response.read()
212
+ return self.__convert_xlsx_to_csv(file_content)
213
+
214
+ elif file_response.status == 401:
215
+ logging.warning("Authentication failed during file download, attempting to refresh token...")
216
+ if await self.__refresh_token():
217
+ retry_count += 1
218
+ break
219
+ else:
220
+ logging.error("Token refresh failed during file download.")
221
+ return False
222
+ break
223
+
224
+ await asyncio.sleep(5)
225
+ return False
226
+
227
+ except Exception as e:
228
+ logging.error(f"Error fetching exported data (attempt {retry_count + 1}): {e}")
229
+ if retry_count < max_retries - 1:
230
+ if await self.__refresh_token():
231
+ retry_count += 1
232
+ continue
233
+ retry_count += 1
234
+
235
+ logging.error(f"Failed to fetch exported data after {max_retries} attempts.")
236
+ return False
@@ -117,11 +117,14 @@ class Loader():
117
117
  #region log
118
118
  import warnings
119
119
  warnings.filterwarnings("ignore", message=".*pin_memory.*no accelerator is found.*")
120
+ warnings.filterwarnings("ignore", category=UserWarning)
120
121
  log_msg_to_ignore = [
121
122
  "Going to convert document batch...",
122
123
  "Initializing pipeline for",
123
124
  "Accelerator device:",
124
125
  "detected formats:",
126
+ "The text detection result is empty",
127
+ "RapidOCR returned empty result!",
125
128
  ]
126
129
  class MessageFilter(logging.Filter):
127
130
  def __init__(self, patterns):
@@ -139,8 +142,11 @@ class Loader():
139
142
  'docling.document_converter',
140
143
  'docling.datamodel',
141
144
  'docling.datamodel.document',
145
+ 'docling.models',
146
+ 'docling.models.rapidocr_model',
142
147
  'docling.utils.accelerator_utils',
143
- 'unstructured'
148
+ 'unstructured',
149
+ 'RapidOCR'
144
150
  ]
145
151
  for logger_name in loggers_to_filter:
146
152
  logging.getLogger(logger_name).addFilter(message_filter)
@@ -8,31 +8,39 @@ from docling.datamodel.accelerator_options import AcceleratorDevice, Accelerator
8
8
  from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode
9
9
  from langchain_community.document_loaders import UnstructuredFileLoader
10
10
  from ws_bom_robot_app.llm.vector_store.db.base import VectorDBStrategy
11
- from docling.datamodel.pipeline_options import TableStructureOptions, TableFormerMode, EasyOcrOptions
11
+ from docling.datamodel.pipeline_options import TableStructureOptions, TableFormerMode, RapidOcrOptions
12
12
  from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
13
13
 
14
14
  def _doclingConverter() -> DocumentConverter:
15
- pipeline_options = PdfPipelineOptions(
16
- accelerator_options=AcceleratorOptions(
15
+ _pipeline_config = {
16
+ "accelerator_options": AcceleratorOptions(
17
17
  device=AcceleratorDevice.AUTO,
18
18
  cuda_use_flash_attention2=False,
19
19
  ),
20
- table_structure_options=TableStructureOptions(mode=TableFormerMode.FAST),
21
- #do_ocr=False,
22
- ocr_options=EasyOcrOptions()
23
- )
20
+ "table_structure_options": TableStructureOptions(mode=TableFormerMode.ACCURATE),
21
+ }
22
+ _base_pipeline_options = PdfPipelineOptions(
23
+ **_pipeline_config,
24
+ do_ocr=False)
25
+ _ocr_pipeline_options = PdfPipelineOptions(
26
+ **_pipeline_config,
27
+ ocr_options=RapidOcrOptions(
28
+ print_verbose=False,
29
+ text_score=0.5,
30
+ #rapidocr_params={"det_use_cuda": True}
31
+ ))
24
32
  doc_converter = DocumentConverter(
25
- format_options={
33
+ format_options={
26
34
  InputFormat.PDF: PdfFormatOption(
27
- pipeline_options=pipeline_options,
35
+ pipeline_options=_base_pipeline_options,
28
36
  ),
29
37
  InputFormat.IMAGE: ImageFormatOption(
30
- pipeline_options=pipeline_options,
38
+ pipeline_options=_ocr_pipeline_options,
31
39
  ),
32
40
  }
33
41
  )
34
- doc_converter.initialize_pipeline(InputFormat.PDF)
35
- doc_converter.initialize_pipeline(InputFormat.IMAGE)
42
+ for frm in [InputFormat.PDF, InputFormat.IMAGE]:
43
+ doc_converter.initialize_pipeline(frm)
36
44
  return doc_converter
37
45
 
38
46
  class DoclingLoader(BaseLoader):
@@ -1,25 +1,25 @@
1
- import json
2
- from typing import Optional
3
- from langchain_core.documents import Document
4
- from langchain_community.document_loaders.base import BaseLoader
5
-
6
- class JsonLoader(BaseLoader):
7
- def __init__(self, file_path: str, meta_fields:Optional[list[str]] = [],encoding: Optional[str] = "utf-8"):
8
- self.file_path = file_path
9
- self.meta_fields = meta_fields
10
- self.encoding = encoding
11
-
12
- def load(self) -> list[Document]:
13
- with open(self.file_path, "r", encoding=self.encoding) as file:
14
- data = json.load(file)
15
- _list = data if isinstance(data, list) else [data]
16
- return [
17
- Document(
18
- page_content=json.dumps(item),
19
- metadata={
20
- "source": self.file_path,
21
- **{field: item.get(field) for field in self.meta_fields if item.get(field)}
22
- }
23
- )
24
- for item in _list
25
- ]
1
+ import json
2
+ from typing import Optional
3
+ from langchain_core.documents import Document
4
+ from langchain_community.document_loaders.base import BaseLoader
5
+
6
+ class JsonLoader(BaseLoader):
7
+ def __init__(self, file_path: str, meta_fields:Optional[list[str]] = [],encoding: Optional[str] = "utf-8"):
8
+ self.file_path = file_path
9
+ self.meta_fields = meta_fields
10
+ self.encoding = encoding
11
+
12
+ def load(self) -> list[Document]:
13
+ with open(self.file_path, "r", encoding=self.encoding) as file:
14
+ data = json.load(file)
15
+ _list = data if isinstance(data, list) else [data]
16
+ return [
17
+ Document(
18
+ page_content=json.dumps(item),
19
+ metadata={
20
+ "source": self.file_path,
21
+ **{field: item.get(field) for field in self.meta_fields if item.get(field)}
22
+ }
23
+ )
24
+ for item in _list
25
+ ]