ws-bom-robot-app 0.0.95__py3-none-any.whl → 0.0.96__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ws_bom_robot_app/llm/vector_store/integration/googledrive.py +45 -16
- {ws_bom_robot_app-0.0.95.dist-info → ws_bom_robot_app-0.0.96.dist-info}/METADATA +1 -1
- {ws_bom_robot_app-0.0.95.dist-info → ws_bom_robot_app-0.0.96.dist-info}/RECORD +5 -5
- {ws_bom_robot_app-0.0.95.dist-info → ws_bom_robot_app-0.0.96.dist-info}/WHEEL +0 -0
- {ws_bom_robot_app-0.0.95.dist-info → ws_bom_robot_app-0.0.96.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,38 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
2
4
|
from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
|
|
3
5
|
from unstructured_ingest.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
|
|
6
|
+
from unstructured_ingest.data_types.file_data import FileData as OriginalFileData, BatchFileData as OriginalBatchFileData
|
|
4
7
|
from langchain_core.documents import Document
|
|
5
8
|
from ws_bom_robot_app.llm.vector_store.loader.base import Loader
|
|
6
9
|
from typing import Union
|
|
7
10
|
from pydantic import BaseModel, Field, AliasChoices
|
|
11
|
+
|
|
12
|
+
# UTF-8 safe FileData classes
|
|
13
|
+
class FileData(OriginalFileData):
|
|
14
|
+
@classmethod
|
|
15
|
+
def from_file(cls, path: str):
|
|
16
|
+
path = Path(path).resolve()
|
|
17
|
+
if not path.exists() or not path.is_file():
|
|
18
|
+
raise ValueError(f"file path not valid: {path}")
|
|
19
|
+
for encoding in ['utf-8', 'cp1252', 'iso-8859-1', 'latin-1']:
|
|
20
|
+
try:
|
|
21
|
+
with open(str(path), "r", encoding=encoding) as f:
|
|
22
|
+
return cls.model_validate(json.load(f))
|
|
23
|
+
except (UnicodeDecodeError, UnicodeError):
|
|
24
|
+
continue
|
|
25
|
+
raise ValueError(f"Could not decode file {path} with any supported encoding")
|
|
26
|
+
|
|
27
|
+
def to_file(self, path: str) -> None:
|
|
28
|
+
path = Path(path).resolve()
|
|
29
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
30
|
+
with open(str(path), "w", encoding="utf-8") as f:
|
|
31
|
+
json.dump(self.model_dump(), f, indent=2, ensure_ascii=False)
|
|
32
|
+
|
|
33
|
+
class BatchFileData(OriginalBatchFileData, FileData):
|
|
34
|
+
pass
|
|
35
|
+
|
|
8
36
|
class GoogleDriveParams(BaseModel):
|
|
9
37
|
"""
|
|
10
38
|
GoogleDriveParams is a model that holds parameters for Google Drive integration.
|
|
@@ -42,26 +70,27 @@ class GoogleDrive(IntegrationStrategy):
|
|
|
42
70
|
super().__init__(knowledgebase_path, data)
|
|
43
71
|
self.__data = GoogleDriveParams.model_validate(self.data)
|
|
44
72
|
self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
|
|
73
|
+
self._apply_encoding_fix()
|
|
74
|
+
|
|
75
|
+
def _apply_encoding_fix(self):
|
|
76
|
+
"""Replace FileData classes with UTF-8 safe versions"""
|
|
77
|
+
import unstructured_ingest.data_types.file_data as fd
|
|
78
|
+
fd.FileData = FileData
|
|
79
|
+
fd.BatchFileData = BatchFileData
|
|
80
|
+
fd.file_data_from_file = lambda path: BatchFileData.from_file(path) if path else FileData.from_file(path)
|
|
81
|
+
|
|
45
82
|
def working_subdirectory(self) -> str:
|
|
46
83
|
return 'googledrive'
|
|
84
|
+
|
|
47
85
|
def run(self) -> None:
|
|
48
|
-
indexer_config = GoogleDriveIndexerConfig(
|
|
49
|
-
extensions=self.__data.extensions,
|
|
50
|
-
recursive=self.__data.recursive
|
|
51
|
-
)
|
|
52
|
-
downloader_config = GoogleDriveDownloaderConfig(
|
|
53
|
-
download_dir=self.working_directory
|
|
54
|
-
)
|
|
55
|
-
connection_config = GoogleDriveConnectionConfig(
|
|
56
|
-
access_config=GoogleDriveAccessConfig(
|
|
57
|
-
service_account_key=self.__data.service_account_key
|
|
58
|
-
),
|
|
59
|
-
drive_id=self.__data.drive_id
|
|
60
|
-
)
|
|
61
86
|
self.__unstructured_ingest.pipeline(
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
87
|
+
GoogleDriveIndexerConfig(extensions=self.__data.extensions, recursive=self.__data.recursive),
|
|
88
|
+
GoogleDriveDownloaderConfig(download_dir=self.working_directory),
|
|
89
|
+
GoogleDriveConnectionConfig(
|
|
90
|
+
access_config=GoogleDriveAccessConfig(service_account_key=self.__data.service_account_key),
|
|
91
|
+
drive_id=self.__data.drive_id
|
|
92
|
+
)
|
|
93
|
+
).run()
|
|
65
94
|
async def load(self) -> list[Document]:
|
|
66
95
|
await asyncio.to_thread(self.run)
|
|
67
96
|
await asyncio.sleep(1)
|
|
@@ -55,7 +55,7 @@ ws_bom_robot_app/llm/vector_store/integration/confluence.py,sha256=TMmGe53tHRTgH
|
|
|
55
55
|
ws_bom_robot_app/llm/vector_store/integration/dropbox.py,sha256=vDEVTq7xkXNvpirMkJHm90WzxcSQqCXNc8PBwzLvSH4,2626
|
|
56
56
|
ws_bom_robot_app/llm/vector_store/integration/gcs.py,sha256=P-NKwNag6fkY3bzFvVkAK5Ayl5CKM8T0MvkaFFwSyT0,3181
|
|
57
57
|
ws_bom_robot_app/llm/vector_store/integration/github.py,sha256=1J4Ph3s58ngEIH5HyCMeeD6lVo2GzdU8y41BvPSLZcc,2441
|
|
58
|
-
ws_bom_robot_app/llm/vector_store/integration/googledrive.py,sha256=
|
|
58
|
+
ws_bom_robot_app/llm/vector_store/integration/googledrive.py,sha256=pQQKWsAskg_6FgC4PVmKY1fMvM8BiFxlUVhh5ERBOF4,5016
|
|
59
59
|
ws_bom_robot_app/llm/vector_store/integration/jira.py,sha256=LPxSXPf268FKTS3wnejssDw6_GIpEPJ3QaNgRgPnb60,6718
|
|
60
60
|
ws_bom_robot_app/llm/vector_store/integration/manager.py,sha256=S5z8LK_RcsCmWvLiBX-cea44CpVAXccND47oUOJ0Yus,1898
|
|
61
61
|
ws_bom_robot_app/llm/vector_store/integration/s3.py,sha256=_SAuPfyK7lIz7Jq1LiBavkF1lre5yqe6DGlMYnxMa4o,3317
|
|
@@ -69,7 +69,7 @@ ws_bom_robot_app/llm/vector_store/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5
|
|
|
69
69
|
ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=GjUS2oaz0LHOSal5pipBkomZtrYUNcKPSd8bzhUU5Dc,6889
|
|
70
70
|
ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=IOv1A0HSIWiHWQFzI4fdApfxrKgXOqwmC3mPXlKplqQ,4012
|
|
71
71
|
ws_bom_robot_app/llm/vector_store/loader/json_loader.py,sha256=LDppW0ZATo4_1hh-KlsAM3TLawBvwBxva_a7k5Oz1sc,858
|
|
72
|
-
ws_bom_robot_app-0.0.
|
|
73
|
-
ws_bom_robot_app-0.0.
|
|
74
|
-
ws_bom_robot_app-0.0.
|
|
75
|
-
ws_bom_robot_app-0.0.
|
|
72
|
+
ws_bom_robot_app-0.0.96.dist-info/METADATA,sha256=-CABRo25yuOPMqmaE_DrR1AGXZkkAY3LZDSx6jZBYXY,10116
|
|
73
|
+
ws_bom_robot_app-0.0.96.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
74
|
+
ws_bom_robot_app-0.0.96.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
|
|
75
|
+
ws_bom_robot_app-0.0.96.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|