unstructured-ingest 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1,85 +1,43 @@
1
- import json
2
- from dataclasses import dataclass, field
3
- from enum import Enum
4
- from pathlib import Path
5
- from time import time
6
- from typing import TYPE_CHECKING, Any, Generator, Optional
7
- from urllib.parse import quote
1
+ from __future__ import annotations
8
2
 
9
- from pydantic import BaseModel, Field, Secret, SecretStr
3
+ import asyncio
4
+ from dataclasses import dataclass
5
+ from typing import TYPE_CHECKING, Any, AsyncIterator
10
6
 
11
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
7
+ from pydantic import Field
8
+
9
+ from unstructured_ingest.error import (
10
+ SourceConnectionError,
11
+ SourceConnectionNetworkError,
12
+ )
12
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
13
14
  from unstructured_ingest.v2.interfaces import (
14
- AccessConfig,
15
- ConnectionConfig,
16
- Downloader,
17
- DownloaderConfig,
18
- DownloadResponse,
19
15
  FileData,
20
- FileDataSourceMetadata,
21
- Indexer,
22
- IndexerConfig,
23
- SourceIdentifiers,
24
16
  )
25
17
  from unstructured_ingest.v2.logger import logger
26
18
  from unstructured_ingest.v2.processes.connector_registry import (
27
19
  SourceRegistryEntry,
28
20
  )
29
-
30
- from .utils import parse_datetime
21
+ from unstructured_ingest.v2.processes.connectors.onedrive import (
22
+ OnedriveAccessConfig,
23
+ OnedriveConnectionConfig,
24
+ OnedriveDownloader,
25
+ OnedriveDownloaderConfig,
26
+ OnedriveIndexer,
27
+ OnedriveIndexerConfig,
28
+ )
31
29
 
32
30
  if TYPE_CHECKING:
33
- from office365.graph_client import GraphClient
34
31
  from office365.onedrive.driveitems.driveItem import DriveItem
35
- from office365.onedrive.drives.drive import Drive
36
- from office365.onedrive.permissions.permission import Permission
37
- from office365.onedrive.sites.site import Site
38
- from office365.sharepoint.client_context import ClientContext
39
- from office365.sharepoint.files.file import File
40
- from office365.sharepoint.folders.folder import Folder
41
- from office365.sharepoint.publishing.pages.page import SitePage
42
32
 
43
33
  CONNECTOR_TYPE = "sharepoint"
44
34
 
45
- MAX_MB_SIZE = 512_000_000
46
-
47
- # TODO handle other data types possible from Sharepoint
48
- # exampled: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint
49
-
50
-
51
- class SharepointContentType(Enum):
52
- DOCUMENT = "document"
53
- SITEPAGE = "site_page"
54
- LIST = "list"
55
-
56
35
 
57
- class SharepointAccessConfig(AccessConfig):
58
- client_cred: str = Field(description="Sharepoint app secret")
36
+ class SharepointAccessConfig(OnedriveAccessConfig):
37
+ client_cred: str = Field(description="Microsoft App client secret")
59
38
 
60
39
 
61
- class SharepointPermissionsConfig(BaseModel):
62
- permissions_application_id: Optional[str] = Field(
63
- default=None, description="Microsoft Graph API application id"
64
- )
65
- permissions_tenant: Optional[str] = Field(
66
- default=None,
67
- description="url to get permissions data within tenant.",
68
- examples=["https://contoso.onmicrosoft.com"],
69
- )
70
- permissions_client_cred: Optional[SecretStr] = Field(
71
- default=None, description="Microsoft Graph API application credentials"
72
- )
73
- authority_url: Optional[SecretStr] = Field(
74
- repr=False,
75
- default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
76
- description="Permissions authority url",
77
- examples=["https://login.microsoftonline.com"],
78
- )
79
-
80
-
81
- class SharepointConnectionConfig(ConnectionConfig):
82
- client_id: str = Field(description="Sharepoint app client ID")
40
+ class SharepointConnectionConfig(OnedriveConnectionConfig):
83
41
  site: str = Field(
84
42
  description="Sharepoint site url. Process either base url e.g \
85
43
  https://[tenant].sharepoint.com or relative sites \
@@ -88,355 +46,78 @@ class SharepointConnectionConfig(ConnectionConfig):
88
46
  https://[tenant]-admin.sharepoint.com.\
89
47
  This requires the app to be registered at a tenant level"
90
48
  )
91
- access_config: Secret[SharepointAccessConfig]
92
- permissions_config: Optional[SharepointPermissionsConfig] = None
93
-
94
- @requires_dependencies(["office365"], extras="sharepoint")
95
- def get_client(self) -> "ClientContext":
96
- from office365.runtime.auth.client_credential import ClientCredential
97
- from office365.sharepoint.client_context import ClientContext
98
-
99
- try:
100
- credentials = ClientCredential(
101
- self.client_id, self.access_config.get_secret_value().client_cred
102
- )
103
- site_client = ClientContext(self.site).with_credentials(credentials)
104
- except Exception as e:
105
- logger.error(f"Couldn't set Sharepoint client: {e}")
106
- raise e
107
- return site_client
108
-
109
- @requires_dependencies(["msal"], extras="sharepoint")
110
- def get_permissions_token(self):
111
- from msal import ConfidentialClientApplication
112
-
113
- try:
114
- client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
115
- app = ConfidentialClientApplication(
116
- authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
117
- f"{self.permissions_config.permissions_tenant}",
118
- client_id=self.permissions_config.permissions_application_id,
119
- client_credential=client_credential,
120
- )
121
- token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
122
- except ValueError as exc:
123
- logger.error("Couldn't set up credentials for Sharepoint")
124
- raise exc
125
- if "error" in token:
126
- raise SourceConnectionNetworkError(
127
- "failed to fetch token, {}: {}".format(token["error"], token["error_description"])
128
- )
129
- return token
130
-
131
- @requires_dependencies(["office365"], extras="sharepoint")
132
- def get_permissions_client(self) -> Optional["GraphClient"]:
133
- from office365.graph_client import GraphClient
134
-
135
- if self.permissions_config is None:
136
- return None
137
-
138
- client = GraphClient(self.get_permissions_token)
139
- return client
140
49
 
141
50
 
142
- class SharepointIndexerConfig(IndexerConfig):
143
- path: Optional[str] = Field(
144
- default=None,
145
- description="Path from which to start parsing files. If the connector is to \
146
- process all sites within the tenant this filter will be applied to \
147
- all sites document libraries.",
148
- )
149
- recursive: bool = Field(
150
- default=False,
151
- description="Recursively download files in their respective folders "
152
- "otherwise stop at the files in provided folder level.",
153
- )
154
- omit_files: bool = Field(default=False, description="Don't process files.")
155
- omit_pages: bool = Field(default=False, description="Don't process site pages.")
156
- omit_lists: bool = Field(default=False, description="Don't process lists.")
51
+ class SharepointIndexerConfig(OnedriveIndexerConfig):
52
+ pass
157
53
 
158
54
 
159
55
  @dataclass
160
- class SharepointIndexer(Indexer):
56
+ class SharepointIndexer(OnedriveIndexer):
161
57
  connection_config: SharepointConnectionConfig
162
- index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
163
-
164
- def precheck(self) -> None:
165
- try:
166
- site_client = self.connection_config.get_client()
167
- site_client.site_pages.pages.get().execute_query()
168
- except Exception as e:
169
- logger.error(f"failed to validate connection: {e}", exc_info=True)
170
- raise SourceConnectionError(f"failed to validate connection: {e}")
171
-
172
- def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
173
- if not recursive:
174
- folder.expand(["Files"]).get().execute_query()
175
- return folder.files
176
-
177
- folder.expand(["Files", "Folders"]).get().execute_query()
178
- files: list["File"] = list(folder.files)
179
- folders: list["Folder"] = list(folder.folders)
180
- for f in folders:
181
- if "/Forms" in f.serverRelativeUrl:
182
- continue
183
- files.extend(self.list_files(f, recursive))
184
- return files
185
-
186
- def get_properties(self, raw_properties: dict) -> dict:
187
- raw_properties = {k: v for k, v in raw_properties.items() if v}
188
- filtered_properties = {}
189
- for k, v in raw_properties.items():
190
- try:
191
- json.dumps(v)
192
- filtered_properties[k] = v
193
- except TypeError:
194
- pass
195
- return filtered_properties
196
-
197
- def list_pages(self, client: "ClientContext") -> list["SitePage"]:
198
- pages = client.site_pages.pages.get().execute_query()
199
- return pages
200
-
201
- def page_to_file_data(self, site_page: "SitePage") -> FileData:
202
- site_page.expand(site_page.properties.keys()).get().execute_query()
203
- version = site_page.properties.get("Version", None)
204
- unique_id = site_page.properties.get("UniqueId", None)
205
- modified_date = site_page.properties.get("Modified", None)
206
- url = site_page.properties.get("AbsoluteUrl", None)
207
- date_modified_dt = parse_datetime(modified_date) if modified_date else None
208
- date_created_at = (
209
- parse_datetime(site_page.first_published)
210
- if (site_page.first_published and site_page.first_published != "0001-01-01T08:00:00Z")
211
- else None
212
- )
213
- file_path = site_page.get_property("Url", "")
214
- server_path = file_path if file_path[0] != "/" else file_path[1:]
215
- additional_metadata = self.get_properties(raw_properties=site_page.properties)
216
- additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value
217
- return FileData(
218
- identifier=unique_id,
219
- connector_type=CONNECTOR_TYPE,
220
- source_identifiers=SourceIdentifiers(
221
- filename=site_page.file_name,
222
- fullpath=file_path,
223
- rel_path=file_path.replace(self.index_config.path, ""),
224
- ),
225
- metadata=FileDataSourceMetadata(
226
- url=url,
227
- version=version,
228
- date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
229
- date_created=str(date_created_at.timestamp()) if date_created_at else None,
230
- date_processed=str(time()),
231
- record_locator={
232
- "server_path": server_path,
233
- },
234
- ),
235
- additional_metadata=additional_metadata,
236
- )
237
-
238
- def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData:
239
- file.expand(file.properties.keys()).get().execute_query()
240
- absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}"
241
- date_modified_dt = (
242
- parse_datetime(file.time_last_modified) if file.time_last_modified else None
243
- )
244
-
245
- date_created_at = parse_datetime(file.time_created) if file.time_created else None
246
- additional_metadata = self.get_properties(raw_properties=file.properties)
247
- additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value
248
- fullpath = str(file.serverRelativeUrl)
249
- rel_path = fullpath.replace(self.index_config.path, "")
250
- while rel_path[0] == "/":
251
- rel_path = rel_path[1:]
252
- return FileData(
253
- identifier=file.unique_id,
254
- connector_type=CONNECTOR_TYPE,
255
- source_identifiers=SourceIdentifiers(
256
- filename=file.name,
257
- fullpath=fullpath,
258
- rel_path=rel_path,
259
- ),
260
- metadata=FileDataSourceMetadata(
261
- url=absolute_url,
262
- version=f"{file.major_version}.{file.minor_version}",
263
- date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
264
- date_created=str(date_created_at.timestamp()) if date_created_at else None,
265
- date_processed=str(time()),
266
- record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url},
267
- ),
268
- additional_metadata=additional_metadata,
269
- )
270
-
271
- def get_root(self, client: "ClientContext") -> "Folder":
272
- if path := self.index_config.path:
273
- return client.web.get_folder_by_server_relative_path(path)
274
- default_document_library = client.web.default_document_library()
275
- root_folder = default_document_library.root_folder
276
- root_folder = root_folder.get().execute_query()
277
- self.index_config.path = root_folder.name
278
- return root_folder
279
-
280
- def get_site_url(self, client: "ClientContext") -> str:
281
- res = client.web.get().execute_query()
282
- return res.url
283
-
284
- def get_site(self, permissions_client: "GraphClient", site_url) -> "Site":
285
- return permissions_client.sites.get_by_url(url=site_url).execute_query()
286
-
287
- def get_permissions_items(self, site: "Site") -> list["DriveItem"]:
288
- # TODO find a way to narrow this search down by name of drive
289
- items: list["DriveItem"] = []
290
- drives: list["Drive"] = site.drives.get_all().execute_query()
291
- for drive in drives:
292
- items.extend(drive.root.children.get_all().execute_query())
293
- return items
58
+ index_config: SharepointIndexerConfig
59
+ connector_type: str = CONNECTOR_TYPE
294
60
 
295
- def map_permission(self, permission: "Permission") -> dict:
296
- return {
297
- "id": permission.id,
298
- "roles": list(permission.roles),
299
- "share_id": permission.share_id,
300
- "has_password": permission.has_password,
301
- "link": permission.link.to_json(),
302
- "granted_to_identities": permission.granted_to_identities.to_json(),
303
- "granted_to": permission.granted_to.to_json(),
304
- "granted_to_v2": permission.granted_to_v2.to_json(),
305
- "granted_to_identities_v2": permission.granted_to_identities_v2.to_json(),
306
- "invitation": permission.invitation.to_json(),
307
- }
61
+ @requires_dependencies(["office365"], extras="sharepoint")
62
+ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
63
+ from office365.runtime.client_request_exception import ClientRequestException
64
+
65
+ token_resp = await asyncio.to_thread(self.connection_config.get_token)
66
+ if "error" in token_resp:
67
+ raise SourceConnectionError(
68
+ f"[{self.connector_type}]: {token_resp['error']} "
69
+ f"({token_resp.get('error_description')})"
70
+ )
308
71
 
309
- def enrich_permissions_on_files(self, all_file_data: list[FileData], site_url: str) -> None:
310
- logger.debug("Enriching permissions on files")
311
- permission_client = self.connection_config.get_permissions_client()
312
- if permission_client is None:
313
- return
314
- site = self.get_site(permissions_client=permission_client, site_url=site_url)
315
- existing_items = self.get_permissions_items(site=site)
316
- for file_data in all_file_data:
317
- etag = file_data.additional_metadata.get("ETag")
318
- if not etag:
319
- continue
320
- matching_items = list(filter(lambda x: x.etag == etag, existing_items))
321
- if not matching_items:
322
- continue
323
- if len(matching_items) > 1:
324
- logger.warning(
325
- "Found multiple drive items with etag matching {}, skipping: {}".format(
326
- etag, ", ".join([i.name for i in matching_items])
327
- )
328
- )
329
- continue
330
- matching_item = matching_items[0]
331
- permissions: list["Permission"] = matching_item.permissions.get_all().execute_query()
332
- permissions_data = [
333
- self.map_permission(permission=permission) for permission in permissions
334
- ]
335
- file_data.metadata.permissions_data = permissions_data
72
+ client = await asyncio.to_thread(self.connection_config.get_client)
73
+ try:
74
+ site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
75
+ site_drive_item = site.drive.get().execute_query().root
76
+ except ClientRequestException:
77
+ logger.info("Site not found")
336
78
 
337
- @property
338
- def process_permissions(self) -> bool:
339
- return (
340
- self.connection_config.permissions_config is not None
341
- and self.connection_config.permissions_config.permissions_tenant
342
- and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
343
- and self.connection_config.permissions_config.permissions_application_id
79
+ drive_items = await self.list_objects(
80
+ folder=site_drive_item, recursive=self.index_config.recursive
344
81
  )
345
-
346
- def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
347
- client = self.connection_config.get_client()
348
- root_folder = self.get_root(client=client)
349
- logger.debug(f"processing content from path: {self.index_config.path}")
350
- if not self.index_config.omit_files:
351
- files = self.list_files(root_folder, recursive=self.index_config.recursive)
352
- file_data = [self.file_to_file_data(file=file, client=client) for file in files]
353
- if self.process_permissions:
354
- self.enrich_permissions_on_files(
355
- all_file_data=file_data, site_url=self.get_site_url(client=client)
356
- )
357
- for file in file_data:
358
- yield file
359
- if not self.index_config.omit_pages:
360
- pages = self.list_pages(client=client)
361
- for page in pages:
362
- file_data = self.page_to_file_data(site_page=page)
363
- file_data.metadata.record_locator["site_url"] = client.base_url
364
- yield file_data
82
+ for drive_item in drive_items:
83
+ file_data = await self.drive_item_to_file_data(drive_item=drive_item)
84
+ yield file_data
365
85
 
366
86
 
367
- class SharepointDownloaderConfig(DownloaderConfig):
87
+ class SharepointDownloaderConfig(OnedriveDownloaderConfig):
368
88
  pass
369
89
 
370
90
 
371
91
  @dataclass
372
- class SharepointDownloader(Downloader):
92
+ class SharepointDownloader(OnedriveDownloader):
373
93
  connection_config: SharepointConnectionConfig
374
94
  download_config: SharepointDownloaderConfig
375
95
  connector_type: str = CONNECTOR_TYPE
376
96
 
377
- def get_download_path(self, file_data: FileData) -> Path:
378
- download_path = super().get_download_path(file_data=file_data)
379
-
380
- content_type = file_data.additional_metadata.get("sharepoint_content_type")
381
- if content_type == SharepointContentType.SITEPAGE.value:
382
- # Update output extension to html if site page
383
- download_path = download_path.with_suffix(".html")
384
- return download_path
385
-
386
- def get_document(self, file_data: FileData) -> DownloadResponse:
387
- client: "ClientContext" = self.connection_config.get_client()
388
- file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier)
389
- download_path = self.get_download_path(file_data=file_data)
390
- download_path.parent.mkdir(parents=True, exist_ok=True)
391
- logger.debug(
392
- f"writing document content {file_data.source_identifiers.fullpath} to {download_path}"
393
- )
394
- with download_path.open("wb") as f:
395
- file.download(f).execute_query()
396
- return self.generate_download_response(file_data=file_data, download_path=download_path)
397
-
398
- def get_site_page(self, file_data: FileData) -> DownloadResponse:
399
- # TODO fetch comments for site page as well
400
- from lxml import etree, html
401
-
402
- canvas_content_raw = file_data.additional_metadata.get("CanvasContent1")
403
- layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent")
404
- html_content = []
405
- if layout_web_parts_content_raw:
406
- layout_web_parts_content = json.loads(layout_web_parts_content_raw)
407
- for web_part in layout_web_parts_content:
408
- properties = web_part.get("properties", {})
409
- if title := properties.get("title"):
410
- html_content.append(f"<title>{title}</title>")
411
- if canvas_content_raw:
412
- canvas_content = json.loads(canvas_content_raw)
413
- for content in canvas_content:
414
- if inner_html := content.get("innerHTML"):
415
- html_content.append(inner_html)
416
- htmls = "".join(html_content)
417
- content = f"<div>{htmls}</div>"
418
- document = html.fromstring(content)
419
- download_path = self.get_download_path(file_data=file_data)
420
- download_path.parent.mkdir(parents=True, exist_ok=True)
421
- logger.debug(
422
- f"writing site page content {file_data.source_identifiers.filename} to {download_path}"
423
- )
424
- with download_path.open("w") as f:
425
- f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
426
- return self.generate_download_response(file_data=file_data, download_path=download_path)
97
+ @SourceConnectionNetworkError.wrap
98
+ @requires_dependencies(["office365"], extras="onedrive")
99
+ def _fetch_file(self, file_data: FileData) -> DriveItem:
100
+ from office365.runtime.client_request_exception import ClientRequestException
427
101
 
428
- def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
429
- content_type = file_data.additional_metadata.get("sharepoint_content_type")
430
- if not content_type:
102
+ if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
431
103
  raise ValueError(
432
- f"Missing sharepoint_content_type metadata: {file_data.additional_metadata}"
104
+ f"file data doesn't have enough information to get "
105
+ f"file content: {file_data.model_dump()}"
433
106
  )
434
- if content_type == SharepointContentType.DOCUMENT.value:
435
- return self.get_document(file_data=file_data)
436
- elif content_type == SharepointContentType.SITEPAGE.value:
437
- return self.get_site_page(file_data=file_data)
438
- else:
439
- raise ValueError(f"content type not recognized: {content_type}")
107
+
108
+ server_relative_path = file_data.source_identifiers.fullpath
109
+ client = self.connection_config.get_client()
110
+
111
+ try:
112
+ site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
113
+ site_drive_item = site.drive.get().execute_query().root
114
+ except ClientRequestException:
115
+ logger.info("Site not found")
116
+ file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
117
+
118
+ if not file:
119
+ raise FileNotFoundError(f"file not found: {server_relative_path}")
120
+ return file
440
121
 
441
122
 
442
123
  sharepoint_source_entry = SourceRegistryEntry(
@@ -18,7 +18,7 @@ class EmbedderConfig(BaseModel):
18
18
  "openai",
19
19
  "azure-openai",
20
20
  "huggingface",
21
- "aws-bedrock",
21
+ "bedrock",
22
22
  "vertexai",
23
23
  "voyageai",
24
24
  "octoai",
@@ -162,7 +162,7 @@ class EmbedderConfig(BaseModel):
162
162
  if self.embedding_provider == "octoai":
163
163
  return self.get_octoai_embedder(embedding_kwargs=kwargs)
164
164
 
165
- if self.embedding_provider == "aws-bedrock":
165
+ if self.embedding_provider == "bedrock":
166
166
  return self.get_bedrock_embedder()
167
167
 
168
168
  if self.embedding_provider == "vertexai":
@@ -1,3 +1,4 @@
1
+ import json
1
2
  from abc import ABC
2
3
  from dataclasses import dataclass
3
4
  from pathlib import Path
@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field, SecretStr
7
8
 
8
9
  from unstructured_ingest.utils.data_prep import flatten_dict
9
10
  from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.errors import UserError
10
12
  from unstructured_ingest.v2.interfaces.process import BaseProcess
11
13
  from unstructured_ingest.v2.logger import logger
12
14
  from unstructured_ingest.v2.unstructured_api import call_api_async
@@ -73,6 +75,9 @@ class PartitionerConfig(BaseModel):
73
75
  hi_res_model_name: Optional[str] = Field(
74
76
  default=None, description="Model name for hi-res strategy."
75
77
  )
78
+ raise_unsupported_filetype: bool = Field(
79
+ default=False, description="Raise an error if the file type is not supported"
80
+ )
76
81
 
77
82
  def model_post_init(self, __context: Any) -> None:
78
83
  if self.metadata_exclude and self.metadata_include:
@@ -151,13 +156,25 @@ class Partitioner(BaseProcess, ABC):
151
156
  class FileDataSourceMetadata(DataSourceMetadata):
152
157
  filesize_bytes: Optional[int] = None
153
158
 
159
+ metadata = metadata or {}
154
160
  logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
155
161
  logger.debug(f"partitioning file {filename} with metadata {metadata}")
156
- elements = partition(
157
- filename=str(filename.resolve()),
158
- data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
159
- **self.config.to_partition_kwargs(),
160
- )
162
+ try:
163
+ elements = partition(
164
+ filename=str(filename.resolve()),
165
+ data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
166
+ **self.config.to_partition_kwargs(),
167
+ )
168
+ except ValueError as sdk_error:
169
+ if (
170
+ self.is_unstructured_error_unsupported_filetype(sdk_error=sdk_error)
171
+ and not self.config.raise_unsupported_filetype
172
+ ):
173
+ logger.warning(
174
+ f"Unsupported file type for strategy {self.config.strategy}: {filename}"
175
+ )
176
+ return []
177
+ raise sdk_error
161
178
  return self.postprocess(elements=elements_to_dicts(elements))
162
179
 
163
180
  @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
@@ -179,10 +196,37 @@ class Partitioner(BaseProcess, ABC):
179
196
  element["metadata"]["data_source"] = metadata
180
197
  return self.postprocess(elements=elements)
181
198
 
199
def is_unstructured_error_unsupported_filetype(self, sdk_error: ValueError) -> bool:
    """Tell whether a local-partition ``ValueError`` means "unsupported file type".

    The decision is a substring match against the error text; the three
    marker phrases below are the ones this check has always looked for.

    Args:
        sdk_error: The ``ValueError`` caught around the local ``partition`` call.

    Returns:
        ``True`` if the error text contains one of the known marker phrases,
        ``False`` otherwise (including when the error carries no message).
    """
    # Use str() rather than args[0]: this runs inside an ``except`` handler,
    # so an IndexError (no args) or TypeError (non-string first arg) raised
    # here would replace the real partition error. For the usual
    # single-string-argument case str(exc) is exactly args[0].
    error_msg = str(sdk_error)
    markers = (
        "Invalid file",
        "Unstructured schema",
        "fast strategy is not available for image files",
    )
    return any(marker in error_msg for marker in markers)
206
+
207
def is_client_error_unsupported_filetype(self, error: "UserError") -> bool:
    """Tell whether an API ``UserError`` means "unsupported file type".

    The first exception argument is expected to be a JSON object with a
    string ``detail`` field. Anything that does not have that shape is
    treated as "not an unsupported-file-type error" instead of raising.

    Args:
        error: The ``UserError`` caught around the remote partition call.

    Returns:
        ``True`` if ``detail`` mentions that the fast strategy is unavailable
        for image files, or that a file type is not supported
        (case-insensitive); ``False`` otherwise.
    """
    # This runs inside an ``except UserError`` handler, so any exception raised
    # while inspecting the payload would hide the original error from the
    # caller. Every parsing step therefore degrades to False.
    try:
        error_dict = json.loads(error.args[0])
    except (IndexError, TypeError, ValueError):
        # No message, non-string message, or message that is not valid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        return False

    details = error_dict.get("detail") if isinstance(error_dict, dict) else None
    if not isinstance(details, str):
        # A missing or structured (list/dict) detail cannot be substring-matched.
        return False

    lowered = details.lower()
    return "fast strategy is not available for image files" in details or (
        "file type" in lowered and "is not supported" in lowered
    )
214
+
182
215
  def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
183
216
  return self.partition_locally(filename, metadata=metadata, **kwargs)
184
217
 
185
218
  async def run_async(
186
219
  self, filename: Path, metadata: Optional[dict] = None, **kwargs
187
220
  ) -> list[dict]:
188
- return await self.partition_via_api(filename, metadata=metadata, **kwargs)
221
+ try:
222
+ return await self.partition_via_api(filename, metadata=metadata, **kwargs)
223
+ except UserError as user_error:
224
+ if (
225
+ self.is_client_error_unsupported_filetype(error=user_error)
226
+ and not self.config.raise_unsupported_filetype
227
+ ):
228
+ logger.warning(
229
+ f"Unsupported file type for strategy {self.config.strategy}: {filename}"
230
+ )
231
+ return []
232
+ raise user_error