unstructured-ingest 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_onedrive.py +57 -10
- test/integration/connectors/test_sharepoint.py +71 -0
- test/integration/connectors/utils/validation/source.py +45 -16
- test/integration/embedders/test_bedrock.py +1 -1
- test/integration/partitioners/test_partitioner.py +10 -9
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/embed/azure_openai.py +21 -2
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +4 -3
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -3
- unstructured_ingest/v2/processes/connectors/sharepoint.py +70 -389
- unstructured_ingest/v2/processes/embedder.py +2 -2
- unstructured_ingest/v2/processes/partitioner.py +50 -6
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/METADATA +19 -19
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/RECORD +21 -19
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/top_level.txt +0 -0
|
@@ -1,85 +1,43 @@
|
|
|
1
|
-
import
|
|
2
|
-
from dataclasses import dataclass, field
|
|
3
|
-
from enum import Enum
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from time import time
|
|
6
|
-
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
-
from urllib.parse import quote
|
|
1
|
+
from __future__ import annotations
|
|
8
2
|
|
|
9
|
-
|
|
3
|
+
import asyncio
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator
|
|
10
6
|
|
|
11
|
-
from
|
|
7
|
+
from pydantic import Field
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.error import (
|
|
10
|
+
SourceConnectionError,
|
|
11
|
+
SourceConnectionNetworkError,
|
|
12
|
+
)
|
|
12
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
14
|
from unstructured_ingest.v2.interfaces import (
|
|
14
|
-
AccessConfig,
|
|
15
|
-
ConnectionConfig,
|
|
16
|
-
Downloader,
|
|
17
|
-
DownloaderConfig,
|
|
18
|
-
DownloadResponse,
|
|
19
15
|
FileData,
|
|
20
|
-
FileDataSourceMetadata,
|
|
21
|
-
Indexer,
|
|
22
|
-
IndexerConfig,
|
|
23
|
-
SourceIdentifiers,
|
|
24
16
|
)
|
|
25
17
|
from unstructured_ingest.v2.logger import logger
|
|
26
18
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
27
19
|
SourceRegistryEntry,
|
|
28
20
|
)
|
|
29
|
-
|
|
30
|
-
|
|
21
|
+
from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
22
|
+
OnedriveAccessConfig,
|
|
23
|
+
OnedriveConnectionConfig,
|
|
24
|
+
OnedriveDownloader,
|
|
25
|
+
OnedriveDownloaderConfig,
|
|
26
|
+
OnedriveIndexer,
|
|
27
|
+
OnedriveIndexerConfig,
|
|
28
|
+
)
|
|
31
29
|
|
|
32
30
|
if TYPE_CHECKING:
|
|
33
|
-
from office365.graph_client import GraphClient
|
|
34
31
|
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
35
|
-
from office365.onedrive.drives.drive import Drive
|
|
36
|
-
from office365.onedrive.permissions.permission import Permission
|
|
37
|
-
from office365.onedrive.sites.site import Site
|
|
38
|
-
from office365.sharepoint.client_context import ClientContext
|
|
39
|
-
from office365.sharepoint.files.file import File
|
|
40
|
-
from office365.sharepoint.folders.folder import Folder
|
|
41
|
-
from office365.sharepoint.publishing.pages.page import SitePage
|
|
42
32
|
|
|
43
33
|
CONNECTOR_TYPE = "sharepoint"
|
|
44
34
|
|
|
45
|
-
MAX_MB_SIZE = 512_000_000
|
|
46
|
-
|
|
47
|
-
# TODO handle other data types possible from Sharepoint
|
|
48
|
-
# exampled: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class SharepointContentType(Enum):
|
|
52
|
-
DOCUMENT = "document"
|
|
53
|
-
SITEPAGE = "site_page"
|
|
54
|
-
LIST = "list"
|
|
55
|
-
|
|
56
35
|
|
|
57
|
-
class SharepointAccessConfig(
|
|
58
|
-
client_cred: str = Field(description="
|
|
36
|
+
class SharepointAccessConfig(OnedriveAccessConfig):
|
|
37
|
+
client_cred: str = Field(description="Microsoft App client secret")
|
|
59
38
|
|
|
60
39
|
|
|
61
|
-
class
|
|
62
|
-
permissions_application_id: Optional[str] = Field(
|
|
63
|
-
default=None, description="Microsoft Graph API application id"
|
|
64
|
-
)
|
|
65
|
-
permissions_tenant: Optional[str] = Field(
|
|
66
|
-
default=None,
|
|
67
|
-
description="url to get permissions data within tenant.",
|
|
68
|
-
examples=["https://contoso.onmicrosoft.com"],
|
|
69
|
-
)
|
|
70
|
-
permissions_client_cred: Optional[SecretStr] = Field(
|
|
71
|
-
default=None, description="Microsoft Graph API application credentials"
|
|
72
|
-
)
|
|
73
|
-
authority_url: Optional[SecretStr] = Field(
|
|
74
|
-
repr=False,
|
|
75
|
-
default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
|
|
76
|
-
description="Permissions authority url",
|
|
77
|
-
examples=["https://login.microsoftonline.com"],
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
class SharepointConnectionConfig(ConnectionConfig):
|
|
82
|
-
client_id: str = Field(description="Sharepoint app client ID")
|
|
40
|
+
class SharepointConnectionConfig(OnedriveConnectionConfig):
|
|
83
41
|
site: str = Field(
|
|
84
42
|
description="Sharepoint site url. Process either base url e.g \
|
|
85
43
|
https://[tenant].sharepoint.com or relative sites \
|
|
@@ -88,355 +46,78 @@ class SharepointConnectionConfig(ConnectionConfig):
|
|
|
88
46
|
https://[tenant]-admin.sharepoint.com.\
|
|
89
47
|
This requires the app to be registered at a tenant level"
|
|
90
48
|
)
|
|
91
|
-
access_config: Secret[SharepointAccessConfig]
|
|
92
|
-
permissions_config: Optional[SharepointPermissionsConfig] = None
|
|
93
|
-
|
|
94
|
-
@requires_dependencies(["office365"], extras="sharepoint")
|
|
95
|
-
def get_client(self) -> "ClientContext":
|
|
96
|
-
from office365.runtime.auth.client_credential import ClientCredential
|
|
97
|
-
from office365.sharepoint.client_context import ClientContext
|
|
98
|
-
|
|
99
|
-
try:
|
|
100
|
-
credentials = ClientCredential(
|
|
101
|
-
self.client_id, self.access_config.get_secret_value().client_cred
|
|
102
|
-
)
|
|
103
|
-
site_client = ClientContext(self.site).with_credentials(credentials)
|
|
104
|
-
except Exception as e:
|
|
105
|
-
logger.error(f"Couldn't set Sharepoint client: {e}")
|
|
106
|
-
raise e
|
|
107
|
-
return site_client
|
|
108
|
-
|
|
109
|
-
@requires_dependencies(["msal"], extras="sharepoint")
|
|
110
|
-
def get_permissions_token(self):
|
|
111
|
-
from msal import ConfidentialClientApplication
|
|
112
|
-
|
|
113
|
-
try:
|
|
114
|
-
client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
|
|
115
|
-
app = ConfidentialClientApplication(
|
|
116
|
-
authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
|
|
117
|
-
f"{self.permissions_config.permissions_tenant}",
|
|
118
|
-
client_id=self.permissions_config.permissions_application_id,
|
|
119
|
-
client_credential=client_credential,
|
|
120
|
-
)
|
|
121
|
-
token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
|
|
122
|
-
except ValueError as exc:
|
|
123
|
-
logger.error("Couldn't set up credentials for Sharepoint")
|
|
124
|
-
raise exc
|
|
125
|
-
if "error" in token:
|
|
126
|
-
raise SourceConnectionNetworkError(
|
|
127
|
-
"failed to fetch token, {}: {}".format(token["error"], token["error_description"])
|
|
128
|
-
)
|
|
129
|
-
return token
|
|
130
|
-
|
|
131
|
-
@requires_dependencies(["office365"], extras="sharepoint")
|
|
132
|
-
def get_permissions_client(self) -> Optional["GraphClient"]:
|
|
133
|
-
from office365.graph_client import GraphClient
|
|
134
|
-
|
|
135
|
-
if self.permissions_config is None:
|
|
136
|
-
return None
|
|
137
|
-
|
|
138
|
-
client = GraphClient(self.get_permissions_token)
|
|
139
|
-
return client
|
|
140
49
|
|
|
141
50
|
|
|
142
|
-
class SharepointIndexerConfig(
|
|
143
|
-
|
|
144
|
-
default=None,
|
|
145
|
-
description="Path from which to start parsing files. If the connector is to \
|
|
146
|
-
process all sites within the tenant this filter will be applied to \
|
|
147
|
-
all sites document libraries.",
|
|
148
|
-
)
|
|
149
|
-
recursive: bool = Field(
|
|
150
|
-
default=False,
|
|
151
|
-
description="Recursively download files in their respective folders "
|
|
152
|
-
"otherwise stop at the files in provided folder level.",
|
|
153
|
-
)
|
|
154
|
-
omit_files: bool = Field(default=False, description="Don't process files.")
|
|
155
|
-
omit_pages: bool = Field(default=False, description="Don't process site pages.")
|
|
156
|
-
omit_lists: bool = Field(default=False, description="Don't process lists.")
|
|
51
|
+
class SharepointIndexerConfig(OnedriveIndexerConfig):
|
|
52
|
+
pass
|
|
157
53
|
|
|
158
54
|
|
|
159
55
|
@dataclass
|
|
160
|
-
class SharepointIndexer(
|
|
56
|
+
class SharepointIndexer(OnedriveIndexer):
|
|
161
57
|
connection_config: SharepointConnectionConfig
|
|
162
|
-
index_config: SharepointIndexerConfig
|
|
163
|
-
|
|
164
|
-
def precheck(self) -> None:
|
|
165
|
-
try:
|
|
166
|
-
site_client = self.connection_config.get_client()
|
|
167
|
-
site_client.site_pages.pages.get().execute_query()
|
|
168
|
-
except Exception as e:
|
|
169
|
-
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
170
|
-
raise SourceConnectionError(f"failed to validate connection: {e}")
|
|
171
|
-
|
|
172
|
-
def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
|
|
173
|
-
if not recursive:
|
|
174
|
-
folder.expand(["Files"]).get().execute_query()
|
|
175
|
-
return folder.files
|
|
176
|
-
|
|
177
|
-
folder.expand(["Files", "Folders"]).get().execute_query()
|
|
178
|
-
files: list["File"] = list(folder.files)
|
|
179
|
-
folders: list["Folder"] = list(folder.folders)
|
|
180
|
-
for f in folders:
|
|
181
|
-
if "/Forms" in f.serverRelativeUrl:
|
|
182
|
-
continue
|
|
183
|
-
files.extend(self.list_files(f, recursive))
|
|
184
|
-
return files
|
|
185
|
-
|
|
186
|
-
def get_properties(self, raw_properties: dict) -> dict:
|
|
187
|
-
raw_properties = {k: v for k, v in raw_properties.items() if v}
|
|
188
|
-
filtered_properties = {}
|
|
189
|
-
for k, v in raw_properties.items():
|
|
190
|
-
try:
|
|
191
|
-
json.dumps(v)
|
|
192
|
-
filtered_properties[k] = v
|
|
193
|
-
except TypeError:
|
|
194
|
-
pass
|
|
195
|
-
return filtered_properties
|
|
196
|
-
|
|
197
|
-
def list_pages(self, client: "ClientContext") -> list["SitePage"]:
|
|
198
|
-
pages = client.site_pages.pages.get().execute_query()
|
|
199
|
-
return pages
|
|
200
|
-
|
|
201
|
-
def page_to_file_data(self, site_page: "SitePage") -> FileData:
|
|
202
|
-
site_page.expand(site_page.properties.keys()).get().execute_query()
|
|
203
|
-
version = site_page.properties.get("Version", None)
|
|
204
|
-
unique_id = site_page.properties.get("UniqueId", None)
|
|
205
|
-
modified_date = site_page.properties.get("Modified", None)
|
|
206
|
-
url = site_page.properties.get("AbsoluteUrl", None)
|
|
207
|
-
date_modified_dt = parse_datetime(modified_date) if modified_date else None
|
|
208
|
-
date_created_at = (
|
|
209
|
-
parse_datetime(site_page.first_published)
|
|
210
|
-
if (site_page.first_published and site_page.first_published != "0001-01-01T08:00:00Z")
|
|
211
|
-
else None
|
|
212
|
-
)
|
|
213
|
-
file_path = site_page.get_property("Url", "")
|
|
214
|
-
server_path = file_path if file_path[0] != "/" else file_path[1:]
|
|
215
|
-
additional_metadata = self.get_properties(raw_properties=site_page.properties)
|
|
216
|
-
additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value
|
|
217
|
-
return FileData(
|
|
218
|
-
identifier=unique_id,
|
|
219
|
-
connector_type=CONNECTOR_TYPE,
|
|
220
|
-
source_identifiers=SourceIdentifiers(
|
|
221
|
-
filename=site_page.file_name,
|
|
222
|
-
fullpath=file_path,
|
|
223
|
-
rel_path=file_path.replace(self.index_config.path, ""),
|
|
224
|
-
),
|
|
225
|
-
metadata=FileDataSourceMetadata(
|
|
226
|
-
url=url,
|
|
227
|
-
version=version,
|
|
228
|
-
date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
|
|
229
|
-
date_created=str(date_created_at.timestamp()) if date_created_at else None,
|
|
230
|
-
date_processed=str(time()),
|
|
231
|
-
record_locator={
|
|
232
|
-
"server_path": server_path,
|
|
233
|
-
},
|
|
234
|
-
),
|
|
235
|
-
additional_metadata=additional_metadata,
|
|
236
|
-
)
|
|
237
|
-
|
|
238
|
-
def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData:
|
|
239
|
-
file.expand(file.properties.keys()).get().execute_query()
|
|
240
|
-
absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}"
|
|
241
|
-
date_modified_dt = (
|
|
242
|
-
parse_datetime(file.time_last_modified) if file.time_last_modified else None
|
|
243
|
-
)
|
|
244
|
-
|
|
245
|
-
date_created_at = parse_datetime(file.time_created) if file.time_created else None
|
|
246
|
-
additional_metadata = self.get_properties(raw_properties=file.properties)
|
|
247
|
-
additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value
|
|
248
|
-
fullpath = str(file.serverRelativeUrl)
|
|
249
|
-
rel_path = fullpath.replace(self.index_config.path, "")
|
|
250
|
-
while rel_path[0] == "/":
|
|
251
|
-
rel_path = rel_path[1:]
|
|
252
|
-
return FileData(
|
|
253
|
-
identifier=file.unique_id,
|
|
254
|
-
connector_type=CONNECTOR_TYPE,
|
|
255
|
-
source_identifiers=SourceIdentifiers(
|
|
256
|
-
filename=file.name,
|
|
257
|
-
fullpath=fullpath,
|
|
258
|
-
rel_path=rel_path,
|
|
259
|
-
),
|
|
260
|
-
metadata=FileDataSourceMetadata(
|
|
261
|
-
url=absolute_url,
|
|
262
|
-
version=f"{file.major_version}.{file.minor_version}",
|
|
263
|
-
date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
|
|
264
|
-
date_created=str(date_created_at.timestamp()) if date_created_at else None,
|
|
265
|
-
date_processed=str(time()),
|
|
266
|
-
record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url},
|
|
267
|
-
),
|
|
268
|
-
additional_metadata=additional_metadata,
|
|
269
|
-
)
|
|
270
|
-
|
|
271
|
-
def get_root(self, client: "ClientContext") -> "Folder":
|
|
272
|
-
if path := self.index_config.path:
|
|
273
|
-
return client.web.get_folder_by_server_relative_path(path)
|
|
274
|
-
default_document_library = client.web.default_document_library()
|
|
275
|
-
root_folder = default_document_library.root_folder
|
|
276
|
-
root_folder = root_folder.get().execute_query()
|
|
277
|
-
self.index_config.path = root_folder.name
|
|
278
|
-
return root_folder
|
|
279
|
-
|
|
280
|
-
def get_site_url(self, client: "ClientContext") -> str:
|
|
281
|
-
res = client.web.get().execute_query()
|
|
282
|
-
return res.url
|
|
283
|
-
|
|
284
|
-
def get_site(self, permissions_client: "GraphClient", site_url) -> "Site":
|
|
285
|
-
return permissions_client.sites.get_by_url(url=site_url).execute_query()
|
|
286
|
-
|
|
287
|
-
def get_permissions_items(self, site: "Site") -> list["DriveItem"]:
|
|
288
|
-
# TODO find a way to narrow this search down by name of drive
|
|
289
|
-
items: list["DriveItem"] = []
|
|
290
|
-
drives: list["Drive"] = site.drives.get_all().execute_query()
|
|
291
|
-
for drive in drives:
|
|
292
|
-
items.extend(drive.root.children.get_all().execute_query())
|
|
293
|
-
return items
|
|
58
|
+
index_config: SharepointIndexerConfig
|
|
59
|
+
connector_type: str = CONNECTOR_TYPE
|
|
294
60
|
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
"granted_to_identities_v2": permission.granted_to_identities_v2.to_json(),
|
|
306
|
-
"invitation": permission.invitation.to_json(),
|
|
307
|
-
}
|
|
61
|
+
@requires_dependencies(["office365"], extras="sharepoint")
|
|
62
|
+
async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
|
|
63
|
+
from office365.runtime.client_request_exception import ClientRequestException
|
|
64
|
+
|
|
65
|
+
token_resp = await asyncio.to_thread(self.connection_config.get_token)
|
|
66
|
+
if "error" in token_resp:
|
|
67
|
+
raise SourceConnectionError(
|
|
68
|
+
f"[{self.connector_type}]: {token_resp['error']} "
|
|
69
|
+
f"({token_resp.get('error_description')})"
|
|
70
|
+
)
|
|
308
71
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
existing_items = self.get_permissions_items(site=site)
|
|
316
|
-
for file_data in all_file_data:
|
|
317
|
-
etag = file_data.additional_metadata.get("ETag")
|
|
318
|
-
if not etag:
|
|
319
|
-
continue
|
|
320
|
-
matching_items = list(filter(lambda x: x.etag == etag, existing_items))
|
|
321
|
-
if not matching_items:
|
|
322
|
-
continue
|
|
323
|
-
if len(matching_items) > 1:
|
|
324
|
-
logger.warning(
|
|
325
|
-
"Found multiple drive items with etag matching {}, skipping: {}".format(
|
|
326
|
-
etag, ", ".join([i.name for i in matching_items])
|
|
327
|
-
)
|
|
328
|
-
)
|
|
329
|
-
continue
|
|
330
|
-
matching_item = matching_items[0]
|
|
331
|
-
permissions: list["Permission"] = matching_item.permissions.get_all().execute_query()
|
|
332
|
-
permissions_data = [
|
|
333
|
-
self.map_permission(permission=permission) for permission in permissions
|
|
334
|
-
]
|
|
335
|
-
file_data.metadata.permissions_data = permissions_data
|
|
72
|
+
client = await asyncio.to_thread(self.connection_config.get_client)
|
|
73
|
+
try:
|
|
74
|
+
site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
|
|
75
|
+
site_drive_item = site.drive.get().execute_query().root
|
|
76
|
+
except ClientRequestException:
|
|
77
|
+
logger.info("Site not found")
|
|
336
78
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
return (
|
|
340
|
-
self.connection_config.permissions_config is not None
|
|
341
|
-
and self.connection_config.permissions_config.permissions_tenant
|
|
342
|
-
and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
|
|
343
|
-
and self.connection_config.permissions_config.permissions_application_id
|
|
79
|
+
drive_items = await self.list_objects(
|
|
80
|
+
folder=site_drive_item, recursive=self.index_config.recursive
|
|
344
81
|
)
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
root_folder = self.get_root(client=client)
|
|
349
|
-
logger.debug(f"processing content from path: {self.index_config.path}")
|
|
350
|
-
if not self.index_config.omit_files:
|
|
351
|
-
files = self.list_files(root_folder, recursive=self.index_config.recursive)
|
|
352
|
-
file_data = [self.file_to_file_data(file=file, client=client) for file in files]
|
|
353
|
-
if self.process_permissions:
|
|
354
|
-
self.enrich_permissions_on_files(
|
|
355
|
-
all_file_data=file_data, site_url=self.get_site_url(client=client)
|
|
356
|
-
)
|
|
357
|
-
for file in file_data:
|
|
358
|
-
yield file
|
|
359
|
-
if not self.index_config.omit_pages:
|
|
360
|
-
pages = self.list_pages(client=client)
|
|
361
|
-
for page in pages:
|
|
362
|
-
file_data = self.page_to_file_data(site_page=page)
|
|
363
|
-
file_data.metadata.record_locator["site_url"] = client.base_url
|
|
364
|
-
yield file_data
|
|
82
|
+
for drive_item in drive_items:
|
|
83
|
+
file_data = await self.drive_item_to_file_data(drive_item=drive_item)
|
|
84
|
+
yield file_data
|
|
365
85
|
|
|
366
86
|
|
|
367
|
-
class SharepointDownloaderConfig(
|
|
87
|
+
class SharepointDownloaderConfig(OnedriveDownloaderConfig):
|
|
368
88
|
pass
|
|
369
89
|
|
|
370
90
|
|
|
371
91
|
@dataclass
|
|
372
|
-
class SharepointDownloader(
|
|
92
|
+
class SharepointDownloader(OnedriveDownloader):
|
|
373
93
|
connection_config: SharepointConnectionConfig
|
|
374
94
|
download_config: SharepointDownloaderConfig
|
|
375
95
|
connector_type: str = CONNECTOR_TYPE
|
|
376
96
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
if content_type == SharepointContentType.SITEPAGE.value:
|
|
382
|
-
# Update output extension to html if site page
|
|
383
|
-
download_path = download_path.with_suffix(".html")
|
|
384
|
-
return download_path
|
|
385
|
-
|
|
386
|
-
def get_document(self, file_data: FileData) -> DownloadResponse:
|
|
387
|
-
client: "ClientContext" = self.connection_config.get_client()
|
|
388
|
-
file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier)
|
|
389
|
-
download_path = self.get_download_path(file_data=file_data)
|
|
390
|
-
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
391
|
-
logger.debug(
|
|
392
|
-
f"writing document content {file_data.source_identifiers.fullpath} to {download_path}"
|
|
393
|
-
)
|
|
394
|
-
with download_path.open("wb") as f:
|
|
395
|
-
file.download(f).execute_query()
|
|
396
|
-
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
397
|
-
|
|
398
|
-
def get_site_page(self, file_data: FileData) -> DownloadResponse:
|
|
399
|
-
# TODO fetch comments for site page as well
|
|
400
|
-
from lxml import etree, html
|
|
401
|
-
|
|
402
|
-
canvas_content_raw = file_data.additional_metadata.get("CanvasContent1")
|
|
403
|
-
layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent")
|
|
404
|
-
html_content = []
|
|
405
|
-
if layout_web_parts_content_raw:
|
|
406
|
-
layout_web_parts_content = json.loads(layout_web_parts_content_raw)
|
|
407
|
-
for web_part in layout_web_parts_content:
|
|
408
|
-
properties = web_part.get("properties", {})
|
|
409
|
-
if title := properties.get("title"):
|
|
410
|
-
html_content.append(f"<title>{title}</title>")
|
|
411
|
-
if canvas_content_raw:
|
|
412
|
-
canvas_content = json.loads(canvas_content_raw)
|
|
413
|
-
for content in canvas_content:
|
|
414
|
-
if inner_html := content.get("innerHTML"):
|
|
415
|
-
html_content.append(inner_html)
|
|
416
|
-
htmls = "".join(html_content)
|
|
417
|
-
content = f"<div>{htmls}</div>"
|
|
418
|
-
document = html.fromstring(content)
|
|
419
|
-
download_path = self.get_download_path(file_data=file_data)
|
|
420
|
-
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
421
|
-
logger.debug(
|
|
422
|
-
f"writing site page content {file_data.source_identifiers.filename} to {download_path}"
|
|
423
|
-
)
|
|
424
|
-
with download_path.open("w") as f:
|
|
425
|
-
f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
|
|
426
|
-
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
97
|
+
@SourceConnectionNetworkError.wrap
|
|
98
|
+
@requires_dependencies(["office365"], extras="onedrive")
|
|
99
|
+
def _fetch_file(self, file_data: FileData) -> DriveItem:
|
|
100
|
+
from office365.runtime.client_request_exception import ClientRequestException
|
|
427
101
|
|
|
428
|
-
|
|
429
|
-
content_type = file_data.additional_metadata.get("sharepoint_content_type")
|
|
430
|
-
if not content_type:
|
|
102
|
+
if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
|
|
431
103
|
raise ValueError(
|
|
432
|
-
f"
|
|
104
|
+
f"file data doesn't have enough information to get "
|
|
105
|
+
f"file content: {file_data.model_dump()}"
|
|
433
106
|
)
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
107
|
+
|
|
108
|
+
server_relative_path = file_data.source_identifiers.fullpath
|
|
109
|
+
client = self.connection_config.get_client()
|
|
110
|
+
|
|
111
|
+
try:
|
|
112
|
+
site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
|
|
113
|
+
site_drive_item = site.drive.get().execute_query().root
|
|
114
|
+
except ClientRequestException:
|
|
115
|
+
logger.info("Site not found")
|
|
116
|
+
file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
|
|
117
|
+
|
|
118
|
+
if not file:
|
|
119
|
+
raise FileNotFoundError(f"file not found: {server_relative_path}")
|
|
120
|
+
return file
|
|
440
121
|
|
|
441
122
|
|
|
442
123
|
sharepoint_source_entry = SourceRegistryEntry(
|
|
@@ -18,7 +18,7 @@ class EmbedderConfig(BaseModel):
|
|
|
18
18
|
"openai",
|
|
19
19
|
"azure-openai",
|
|
20
20
|
"huggingface",
|
|
21
|
-
"
|
|
21
|
+
"bedrock",
|
|
22
22
|
"vertexai",
|
|
23
23
|
"voyageai",
|
|
24
24
|
"octoai",
|
|
@@ -162,7 +162,7 @@ class EmbedderConfig(BaseModel):
|
|
|
162
162
|
if self.embedding_provider == "octoai":
|
|
163
163
|
return self.get_octoai_embedder(embedding_kwargs=kwargs)
|
|
164
164
|
|
|
165
|
-
if self.embedding_provider == "
|
|
165
|
+
if self.embedding_provider == "bedrock":
|
|
166
166
|
return self.get_bedrock_embedder()
|
|
167
167
|
|
|
168
168
|
if self.embedding_provider == "vertexai":
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
from abc import ABC
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from pathlib import Path
|
|
@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field, SecretStr
|
|
|
7
8
|
|
|
8
9
|
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
9
10
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.errors import UserError
|
|
10
12
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
11
13
|
from unstructured_ingest.v2.logger import logger
|
|
12
14
|
from unstructured_ingest.v2.unstructured_api import call_api_async
|
|
@@ -73,6 +75,9 @@ class PartitionerConfig(BaseModel):
|
|
|
73
75
|
hi_res_model_name: Optional[str] = Field(
|
|
74
76
|
default=None, description="Model name for hi-res strategy."
|
|
75
77
|
)
|
|
78
|
+
raise_unsupported_filetype: bool = Field(
|
|
79
|
+
default=False, description="Raise an error if the file type is not supported"
|
|
80
|
+
)
|
|
76
81
|
|
|
77
82
|
def model_post_init(self, __context: Any) -> None:
|
|
78
83
|
if self.metadata_exclude and self.metadata_include:
|
|
@@ -151,13 +156,25 @@ class Partitioner(BaseProcess, ABC):
|
|
|
151
156
|
class FileDataSourceMetadata(DataSourceMetadata):
|
|
152
157
|
filesize_bytes: Optional[int] = None
|
|
153
158
|
|
|
159
|
+
metadata = metadata or {}
|
|
154
160
|
logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
|
|
155
161
|
logger.debug(f"partitioning file {filename} with metadata {metadata}")
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
162
|
+
try:
|
|
163
|
+
elements = partition(
|
|
164
|
+
filename=str(filename.resolve()),
|
|
165
|
+
data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
|
|
166
|
+
**self.config.to_partition_kwargs(),
|
|
167
|
+
)
|
|
168
|
+
except ValueError as sdk_error:
|
|
169
|
+
if (
|
|
170
|
+
self.is_unstructured_error_unsupported_filetype(sdk_error=sdk_error)
|
|
171
|
+
and not self.config.raise_unsupported_filetype
|
|
172
|
+
):
|
|
173
|
+
logger.warning(
|
|
174
|
+
f"Unsupported file type for strategy {self.config.strategy}: {filename}"
|
|
175
|
+
)
|
|
176
|
+
return []
|
|
177
|
+
raise sdk_error
|
|
161
178
|
return self.postprocess(elements=elements_to_dicts(elements))
|
|
162
179
|
|
|
163
180
|
@requires_dependencies(dependencies=["unstructured_client"], extras="remote")
|
|
@@ -179,10 +196,37 @@ class Partitioner(BaseProcess, ABC):
|
|
|
179
196
|
element["metadata"]["data_source"] = metadata
|
|
180
197
|
return self.postprocess(elements=elements)
|
|
181
198
|
|
|
199
|
+
def is_unstructured_error_unsupported_filetype(self, sdk_error: ValueError) -> bool:
|
|
200
|
+
error_msg = sdk_error.args[0]
|
|
201
|
+
return (
|
|
202
|
+
"Invalid file" in error_msg
|
|
203
|
+
or "Unstructured schema" in error_msg
|
|
204
|
+
or "fast strategy is not available for image files" in error_msg
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
def is_client_error_unsupported_filetype(self, error: UserError) -> bool:
|
|
208
|
+
error_msg = error.args[0]
|
|
209
|
+
error_dict = json.loads(error_msg)
|
|
210
|
+
details = error_dict["detail"]
|
|
211
|
+
return "fast strategy is not available for image files" in details or (
|
|
212
|
+
"file type" in details.lower() and "is not supported" in details.lower()
|
|
213
|
+
)
|
|
214
|
+
|
|
182
215
|
def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
|
|
183
216
|
return self.partition_locally(filename, metadata=metadata, **kwargs)
|
|
184
217
|
|
|
185
218
|
async def run_async(
|
|
186
219
|
self, filename: Path, metadata: Optional[dict] = None, **kwargs
|
|
187
220
|
) -> list[dict]:
|
|
188
|
-
|
|
221
|
+
try:
|
|
222
|
+
return await self.partition_via_api(filename, metadata=metadata, **kwargs)
|
|
223
|
+
except UserError as user_error:
|
|
224
|
+
if (
|
|
225
|
+
self.is_client_error_unsupported_filetype(error=user_error)
|
|
226
|
+
and not self.config.raise_unsupported_filetype
|
|
227
|
+
):
|
|
228
|
+
logger.warning(
|
|
229
|
+
f"Unsupported file type for strategy {self.config.strategy}: {filename}"
|
|
230
|
+
)
|
|
231
|
+
return []
|
|
232
|
+
raise user_error
|