unstructured-ingest: 1.0.53 (py3-none-any.whl) → 1.0.55 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.53" # pragma: no cover
1
+ __version__ = "1.0.55" # pragma: no cover
@@ -20,6 +20,7 @@ from unstructured_ingest.error import (
20
20
  SourceConnectionError,
21
21
  SourceConnectionNetworkError,
22
22
  )
23
+ from unstructured_ingest.errors_v2 import UserAuthError
23
24
  from unstructured_ingest.interfaces import (
24
25
  AccessConfig,
25
26
  ConnectionConfig,
@@ -114,12 +115,27 @@ class OnedriveConnectionConfig(ConnectionConfig):
114
115
  except ValueError as exc:
115
116
  logger.error("Couldn't set up credentials.")
116
117
  raise exc
118
+
117
119
  if "error" in token:
118
- raise SourceConnectionNetworkError(
119
- "failed to fetch token, {}: {}".format(
120
- token["error"], token["error_description"]
120
+ error_codes = token.get("error_codes", [])
121
+ error_type = token.get("error", "")
122
+ error_description = token.get("error_description", "")
123
+
124
+ # 7000215: Invalid client secret provided
125
+ # 7000218: Invalid client id provided
126
+ # 700016: Application not found in directory
127
+ # 90002: Tenant not found
128
+ auth_error_codes = [7000215, 7000218, 700016, 90002]
129
+
130
+ if (any(code in error_codes for code in auth_error_codes) or
131
+ error_type in ["invalid_client", "unauthorized_client", "invalid_grant"]):
132
+ raise UserAuthError(
133
+ f"Authentication failed: {error_type}: {error_description}"
134
+ )
135
+ else:
136
+ raise SourceConnectionNetworkError(
137
+ f"Failed to fetch token: {error_type}: {error_description}"
121
138
  )
122
- )
123
139
  return token
124
140
 
125
141
  @requires_dependencies(["office365"], extras="onedrive")
@@ -13,6 +13,7 @@ from unstructured_ingest.error import (
13
13
  SourceConnectionError,
14
14
  SourceConnectionNetworkError,
15
15
  )
16
+ from unstructured_ingest.errors_v2 import UserAuthError, UserError
16
17
  from unstructured_ingest.logger import logger
17
18
  from unstructured_ingest.processes.connector_registry import (
18
19
  SourceRegistryEntry,
@@ -30,6 +31,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
30
31
  if TYPE_CHECKING:
31
32
  from office365.onedrive.driveitems.driveItem import DriveItem
32
33
  from office365.onedrive.sites.site import Site
34
+ from office365.runtime.client_request_exception import ClientRequestException
33
35
 
34
36
  CONNECTOR_TYPE = "sharepoint"
35
37
  LEGACY_DEFAULT_PATH = "Shared Documents"
@@ -82,8 +84,8 @@ class SharepointConnectionConfig(OnedriveConnectionConfig):
82
84
 
83
85
 
84
86
  class SharepointIndexerConfig(OnedriveIndexerConfig):
85
- pass
86
-
87
+ # TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
88
+ path: str = Field(default="")
87
89
 
88
90
  @dataclass
89
91
  class SharepointIndexer(OnedriveIndexer):
@@ -91,6 +93,85 @@ class SharepointIndexer(OnedriveIndexer):
91
93
  index_config: SharepointIndexerConfig
92
94
  connector_type: str = CONNECTOR_TYPE
93
95
 
96
+ def _handle_client_request_exception(self, e: ClientRequestException, context: str) -> None:
97
+ """Convert ClientRequestException to appropriate user-facing error based on HTTP status."""
98
+ if hasattr(e, "response") and e.response is not None and hasattr(e.response, "status_code"):
99
+ status_code = e.response.status_code
100
+ if status_code == 401:
101
+ raise UserAuthError(
102
+ f"Unauthorized access to {context}. Check client credentials and permissions"
103
+ )
104
+ elif status_code == 403:
105
+ raise UserAuthError(
106
+ f"Access forbidden to {context}. "
107
+ f"Check app permissions (Sites.Read.All required)"
108
+ )
109
+ elif status_code == 404:
110
+ raise UserError(f"Not found: {context}")
111
+
112
+ raise UserError(f"Failed to access {context}: {str(e)}")
113
+
114
+ def _is_root_path(self, path: str) -> bool:
115
+ """Check if the path represents root access (empty string or legacy default)."""
116
+ return not path or not path.strip() or path == LEGACY_DEFAULT_PATH
117
+
118
+ def _get_target_drive_item(self, site_drive_item: DriveItem, path: str) -> DriveItem:
119
+ """Get the drive item to search in based on the path."""
120
+ if self._is_root_path(path):
121
+ return site_drive_item
122
+ else:
123
+ return site_drive_item.get_by_path(path).get().execute_query()
124
+
125
+ def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
126
+ """Validate that a specific folder path exists and is accessible."""
127
+ from office365.runtime.client_request_exception import ClientRequestException
128
+
129
+ try:
130
+ path_item = site_drive_item.get_by_path(path).get().execute_query()
131
+ if path_item is None or not hasattr(path_item, "is_folder"):
132
+ raise UserError(
133
+ f"SharePoint path '{path}' not found in site {self.connection_config.site}. "
134
+ f"Check that the path exists and you have access to it"
135
+ )
136
+ logger.info(f"SharePoint folder path '{path}' validated successfully")
137
+ except ClientRequestException as e:
138
+ logger.error(f"Failed to access SharePoint path '{path}': {e}")
139
+ self._handle_client_request_exception(e, f"SharePoint path '{path}'")
140
+ except Exception as e:
141
+ logger.error(f"Unexpected error accessing SharePoint path '{path}': {e}")
142
+ raise UserError(f"Failed to validate SharePoint path '{path}': {str(e)}")
143
+
144
+ @requires_dependencies(["office365"], extras="sharepoint")
145
+ def precheck(self) -> None:
146
+ """Validate SharePoint connection before indexing."""
147
+ from office365.runtime.client_request_exception import ClientRequestException
148
+
149
+ # Validate authentication - this call will raise UserAuthError if invalid
150
+ self.connection_config.get_token()
151
+
152
+ try:
153
+ client = self.connection_config.get_client()
154
+ client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
155
+ site_drive_item = self.connection_config._get_drive_item(client_site)
156
+
157
+ path = self.index_config.path
158
+ if not self._is_root_path(path):
159
+ self._validate_folder_path(site_drive_item, path)
160
+
161
+ logger.info(
162
+ f"SharePoint connection validated successfully for site: "
163
+ f"{self.connection_config.site}"
164
+ )
165
+
166
+ except ClientRequestException as e:
167
+ logger.error(f"SharePoint precheck failed for site: {self.connection_config.site}")
168
+ self._handle_client_request_exception(
169
+ e, f"SharePoint site {self.connection_config.site}"
170
+ )
171
+ except Exception as e:
172
+ logger.error(f"Unexpected error during SharePoint precheck: {e}", exc_info=True)
173
+ raise UserError(f"Failed to validate SharePoint connection: {str(e)}")
174
+
94
175
  @requires_dependencies(["office365"], extras="sharepoint")
95
176
  async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
96
177
  from office365.runtime.client_request_exception import ClientRequestException
@@ -113,11 +194,11 @@ class SharepointIndexer(OnedriveIndexer):
113
194
  )
114
195
 
115
196
  path = self.index_config.path
116
- # Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not.
117
- if path and path != LEGACY_DEFAULT_PATH:
118
- site_drive_item = site_drive_item.get_by_path(path).get().execute_query()
197
+ target_drive_item = await asyncio.to_thread(
198
+ self._get_target_drive_item, site_drive_item, path
199
+ )
119
200
 
120
- for drive_item in site_drive_item.get_files(
201
+ for drive_item in target_drive_item.get_files(
121
202
  recursive=self.index_config.recursive
122
203
  ).execute_query():
123
204
  file_data = await self.drive_item_to_file_data(drive_item=drive_item)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.53
3
+ Version: 1.0.55
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=R6_J5XcJv3o4se9ZaxG3ld-O1m1WwU2CyxpqODyEhmg,43
2
+ unstructured_ingest/__version__.py,sha256=WcrHy96lfPCfFMicHHTxBEY2M7zSC_2LVKoyMsYUTrI,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -79,12 +79,12 @@ unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icG
79
79
  unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
80
80
  unstructured_ingest/processes/connectors/mongodb.py,sha256=idjolwS5TXShcIz2jR_socSgh8HOzJwyOnzE1qLUPBw,15362
81
81
  unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
82
- unstructured_ingest/processes/connectors/onedrive.py,sha256=nZt6hsFMlURgB5-BioFBzJySieRVU8xi99QhOCtorxQ,19343
82
+ unstructured_ingest/processes/connectors/onedrive.py,sha256=fGwa-x9D3gyLQtaSXbz6pfiFiLpnO2GVtJmU5kb-qd0,20197
83
83
  unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y1vhjsS8uSg7aZ2LBrTjfHk,9376
84
84
  unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
85
85
  unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
86
86
  unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
87
- unstructured_ingest/processes/connectors/sharepoint.py,sha256=Wgv9Pih9S9FmQJud1bg7kj_qqi55d7QZ48LqlTU_mk0,6509
87
+ unstructured_ingest/processes/connectors/sharepoint.py,sha256=IV6gs4vx4q-QEDwA-Rm6yYCwzopuVl8bKC8CcBU1Lkk,10677
88
88
  unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
89
89
  unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
90
90
  unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
@@ -232,8 +232,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
232
232
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
233
233
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
234
234
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
235
- unstructured_ingest-1.0.53.dist-info/METADATA,sha256=FTZfBOPcfl-pbxoBAIK-d0MwQKfN5nbJXQh92H1QwMs,8842
236
- unstructured_ingest-1.0.53.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
237
- unstructured_ingest-1.0.53.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
238
- unstructured_ingest-1.0.53.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
239
- unstructured_ingest-1.0.53.dist-info/RECORD,,
235
+ unstructured_ingest-1.0.55.dist-info/METADATA,sha256=x-w3d3LQjOuPDVtLQbGgyeCzfMlgVF4OyhGWwm91o8w,8842
236
+ unstructured_ingest-1.0.55.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
237
+ unstructured_ingest-1.0.55.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
238
+ unstructured_ingest-1.0.55.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
239
+ unstructured_ingest-1.0.55.dist-info/RECORD,,