unstructured-ingest 1.0.53__py3-none-any.whl → 1.0.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/processes/connectors/onedrive.py +20 -4
- unstructured_ingest/processes/connectors/sharepoint.py +87 -6
- {unstructured_ingest-1.0.53.dist-info → unstructured_ingest-1.0.55.dist-info}/METADATA +1 -1
- {unstructured_ingest-1.0.53.dist-info → unstructured_ingest-1.0.55.dist-info}/RECORD +8 -8
- {unstructured_ingest-1.0.53.dist-info → unstructured_ingest-1.0.55.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.0.53.dist-info → unstructured_ingest-1.0.55.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.0.53.dist-info → unstructured_ingest-1.0.55.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.0.53" # pragma: no cover
|
|
1
|
+
__version__ = "1.0.55" # pragma: no cover
|
|
@@ -20,6 +20,7 @@ from unstructured_ingest.error import (
|
|
|
20
20
|
SourceConnectionError,
|
|
21
21
|
SourceConnectionNetworkError,
|
|
22
22
|
)
|
|
23
|
+
from unstructured_ingest.errors_v2 import UserAuthError
|
|
23
24
|
from unstructured_ingest.interfaces import (
|
|
24
25
|
AccessConfig,
|
|
25
26
|
ConnectionConfig,
|
|
@@ -114,12 +115,27 @@ class OnedriveConnectionConfig(ConnectionConfig):
|
|
|
114
115
|
except ValueError as exc:
|
|
115
116
|
logger.error("Couldn't set up credentials.")
|
|
116
117
|
raise exc
|
|
118
|
+
|
|
117
119
|
if "error" in token:
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
120
|
+
error_codes = token.get("error_codes", [])
|
|
121
|
+
error_type = token.get("error", "")
|
|
122
|
+
error_description = token.get("error_description", "")
|
|
123
|
+
|
|
124
|
+
# 7000215: Invalid client secret provided
|
|
125
|
+
# 7000218: Invalid client id provided
|
|
126
|
+
# 700016: Application not found in directory
|
|
127
|
+
# 90002: Tenant not found
|
|
128
|
+
auth_error_codes = [7000215, 7000218, 700016, 90002]
|
|
129
|
+
|
|
130
|
+
if (any(code in error_codes for code in auth_error_codes) or
|
|
131
|
+
error_type in ["invalid_client", "unauthorized_client", "invalid_grant"]):
|
|
132
|
+
raise UserAuthError(
|
|
133
|
+
f"Authentication failed: {error_type}: {error_description}"
|
|
134
|
+
)
|
|
135
|
+
else:
|
|
136
|
+
raise SourceConnectionNetworkError(
|
|
137
|
+
f"Failed to fetch token: {error_type}: {error_description}"
|
|
121
138
|
)
|
|
122
|
-
)
|
|
123
139
|
return token
|
|
124
140
|
|
|
125
141
|
@requires_dependencies(["office365"], extras="onedrive")
|
|
@@ -13,6 +13,7 @@ from unstructured_ingest.error import (
|
|
|
13
13
|
SourceConnectionError,
|
|
14
14
|
SourceConnectionNetworkError,
|
|
15
15
|
)
|
|
16
|
+
from unstructured_ingest.errors_v2 import UserAuthError, UserError
|
|
16
17
|
from unstructured_ingest.logger import logger
|
|
17
18
|
from unstructured_ingest.processes.connector_registry import (
|
|
18
19
|
SourceRegistryEntry,
|
|
@@ -30,6 +31,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
30
31
|
if TYPE_CHECKING:
|
|
31
32
|
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
32
33
|
from office365.onedrive.sites.site import Site
|
|
34
|
+
from office365.runtime.client_request_exception import ClientRequestException
|
|
33
35
|
|
|
34
36
|
CONNECTOR_TYPE = "sharepoint"
|
|
35
37
|
LEGACY_DEFAULT_PATH = "Shared Documents"
|
|
@@ -82,8 +84,8 @@ class SharepointConnectionConfig(OnedriveConnectionConfig):
|
|
|
82
84
|
|
|
83
85
|
|
|
84
86
|
class SharepointIndexerConfig(OnedriveIndexerConfig):
|
|
85
|
-
|
|
86
|
-
|
|
87
|
+
# TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
|
|
88
|
+
path: str = Field(default="")
|
|
87
89
|
|
|
88
90
|
@dataclass
|
|
89
91
|
class SharepointIndexer(OnedriveIndexer):
|
|
@@ -91,6 +93,85 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
91
93
|
index_config: SharepointIndexerConfig
|
|
92
94
|
connector_type: str = CONNECTOR_TYPE
|
|
93
95
|
|
|
96
|
+
def _handle_client_request_exception(self, e: ClientRequestException, context: str) -> None:
|
|
97
|
+
"""Convert ClientRequestException to appropriate user-facing error based on HTTP status."""
|
|
98
|
+
if hasattr(e, "response") and e.response is not None and hasattr(e.response, "status_code"):
|
|
99
|
+
status_code = e.response.status_code
|
|
100
|
+
if status_code == 401:
|
|
101
|
+
raise UserAuthError(
|
|
102
|
+
f"Unauthorized access to {context}. Check client credentials and permissions"
|
|
103
|
+
)
|
|
104
|
+
elif status_code == 403:
|
|
105
|
+
raise UserAuthError(
|
|
106
|
+
f"Access forbidden to {context}. "
|
|
107
|
+
f"Check app permissions (Sites.Read.All required)"
|
|
108
|
+
)
|
|
109
|
+
elif status_code == 404:
|
|
110
|
+
raise UserError(f"Not found: {context}")
|
|
111
|
+
|
|
112
|
+
raise UserError(f"Failed to access {context}: {str(e)}")
|
|
113
|
+
|
|
114
|
+
def _is_root_path(self, path: str) -> bool:
|
|
115
|
+
"""Check if the path represents root access (empty string or legacy default)."""
|
|
116
|
+
return not path or not path.strip() or path == LEGACY_DEFAULT_PATH
|
|
117
|
+
|
|
118
|
+
def _get_target_drive_item(self, site_drive_item: DriveItem, path: str) -> DriveItem:
|
|
119
|
+
"""Get the drive item to search in based on the path."""
|
|
120
|
+
if self._is_root_path(path):
|
|
121
|
+
return site_drive_item
|
|
122
|
+
else:
|
|
123
|
+
return site_drive_item.get_by_path(path).get().execute_query()
|
|
124
|
+
|
|
125
|
+
def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
|
|
126
|
+
"""Validate that a specific folder path exists and is accessible."""
|
|
127
|
+
from office365.runtime.client_request_exception import ClientRequestException
|
|
128
|
+
|
|
129
|
+
try:
|
|
130
|
+
path_item = site_drive_item.get_by_path(path).get().execute_query()
|
|
131
|
+
if path_item is None or not hasattr(path_item, "is_folder"):
|
|
132
|
+
raise UserError(
|
|
133
|
+
f"SharePoint path '{path}' not found in site {self.connection_config.site}. "
|
|
134
|
+
f"Check that the path exists and you have access to it"
|
|
135
|
+
)
|
|
136
|
+
logger.info(f"SharePoint folder path '{path}' validated successfully")
|
|
137
|
+
except ClientRequestException as e:
|
|
138
|
+
logger.error(f"Failed to access SharePoint path '{path}': {e}")
|
|
139
|
+
self._handle_client_request_exception(e, f"SharePoint path '{path}'")
|
|
140
|
+
except Exception as e:
|
|
141
|
+
logger.error(f"Unexpected error accessing SharePoint path '{path}': {e}")
|
|
142
|
+
raise UserError(f"Failed to validate SharePoint path '{path}': {str(e)}")
|
|
143
|
+
|
|
144
|
+
@requires_dependencies(["office365"], extras="sharepoint")
|
|
145
|
+
def precheck(self) -> None:
|
|
146
|
+
"""Validate SharePoint connection before indexing."""
|
|
147
|
+
from office365.runtime.client_request_exception import ClientRequestException
|
|
148
|
+
|
|
149
|
+
# Validate authentication - this call will raise UserAuthError if invalid
|
|
150
|
+
self.connection_config.get_token()
|
|
151
|
+
|
|
152
|
+
try:
|
|
153
|
+
client = self.connection_config.get_client()
|
|
154
|
+
client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
|
|
155
|
+
site_drive_item = self.connection_config._get_drive_item(client_site)
|
|
156
|
+
|
|
157
|
+
path = self.index_config.path
|
|
158
|
+
if not self._is_root_path(path):
|
|
159
|
+
self._validate_folder_path(site_drive_item, path)
|
|
160
|
+
|
|
161
|
+
logger.info(
|
|
162
|
+
f"SharePoint connection validated successfully for site: "
|
|
163
|
+
f"{self.connection_config.site}"
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
except ClientRequestException as e:
|
|
167
|
+
logger.error(f"SharePoint precheck failed for site: {self.connection_config.site}")
|
|
168
|
+
self._handle_client_request_exception(
|
|
169
|
+
e, f"SharePoint site {self.connection_config.site}"
|
|
170
|
+
)
|
|
171
|
+
except Exception as e:
|
|
172
|
+
logger.error(f"Unexpected error during SharePoint precheck: {e}", exc_info=True)
|
|
173
|
+
raise UserError(f"Failed to validate SharePoint connection: {str(e)}")
|
|
174
|
+
|
|
94
175
|
@requires_dependencies(["office365"], extras="sharepoint")
|
|
95
176
|
async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
|
|
96
177
|
from office365.runtime.client_request_exception import ClientRequestException
|
|
@@ -113,11 +194,11 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
113
194
|
)
|
|
114
195
|
|
|
115
196
|
path = self.index_config.path
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
197
|
+
target_drive_item = await asyncio.to_thread(
|
|
198
|
+
self._get_target_drive_item, site_drive_item, path
|
|
199
|
+
)
|
|
119
200
|
|
|
120
|
-
for drive_item in
|
|
201
|
+
for drive_item in target_drive_item.get_files(
|
|
121
202
|
recursive=self.index_config.recursive
|
|
122
203
|
).execute_query():
|
|
123
204
|
file_data = await self.drive_item_to_file_data(drive_item=drive_item)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
2
|
+
unstructured_ingest/__version__.py,sha256=WcrHy96lfPCfFMicHHTxBEY2M7zSC_2LVKoyMsYUTrI,43
|
|
3
3
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
4
|
unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
|
|
5
5
|
unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
|
|
@@ -79,12 +79,12 @@ unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icG
|
|
|
79
79
|
unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
|
|
80
80
|
unstructured_ingest/processes/connectors/mongodb.py,sha256=idjolwS5TXShcIz2jR_socSgh8HOzJwyOnzE1qLUPBw,15362
|
|
81
81
|
unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
|
|
82
|
-
unstructured_ingest/processes/connectors/onedrive.py,sha256=
|
|
82
|
+
unstructured_ingest/processes/connectors/onedrive.py,sha256=fGwa-x9D3gyLQtaSXbz6pfiFiLpnO2GVtJmU5kb-qd0,20197
|
|
83
83
|
unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y1vhjsS8uSg7aZ2LBrTjfHk,9376
|
|
84
84
|
unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
|
|
85
85
|
unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
|
|
86
86
|
unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
|
|
87
|
-
unstructured_ingest/processes/connectors/sharepoint.py,sha256=
|
|
87
|
+
unstructured_ingest/processes/connectors/sharepoint.py,sha256=IV6gs4vx4q-QEDwA-Rm6yYCwzopuVl8bKC8CcBU1Lkk,10677
|
|
88
88
|
unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
|
|
89
89
|
unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
|
|
90
90
|
unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
|
|
@@ -232,8 +232,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
|
|
|
232
232
|
unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
|
|
233
233
|
unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
|
|
234
234
|
unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
|
|
235
|
-
unstructured_ingest-1.0.
|
|
236
|
-
unstructured_ingest-1.0.53.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
237
|
-
unstructured_ingest-1.0.53.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
238
|
-
unstructured_ingest-1.0.53.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
239
|
-
unstructured_ingest-1.0.53.dist-info/RECORD,,
|
|
235
|
+
unstructured_ingest-1.0.55.dist-info/METADATA,sha256=x-w3d3LQjOuPDVtLQbGgyeCzfMlgVF4OyhGWwm91o8w,8842
|
|
236
|
+
unstructured_ingest-1.0.55.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
237
|
+
unstructured_ingest-1.0.55.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
238
|
+
unstructured_ingest-1.0.55.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
239
|
+
unstructured_ingest-1.0.55.dist-info/RECORD,,
|
|
File without changes
|
{unstructured_ingest-1.0.53.dist-info → unstructured_ingest-1.0.55.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.53.dist-info → unstructured_ingest-1.0.55.dist-info}/licenses/LICENSE.md
RENAMED
|
File without changes
|