unstructured-ingest 0.4.6__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest has been flagged as potentially problematic.

@@ -18,9 +18,6 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
 
 
 @pytest.fixture
-@pytest.mark.xfail(
-    reason="Issues with test setup on the provider side."
-) # TODO: remove line when issues are addressed
 def onedrive_test_folder() -> str:
     """
     Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -67,9 +64,6 @@ def get_connection_config():
 
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
 @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
-@pytest.mark.xfail(
-    reason="Issues with test setup on the provider side."
-) # TODO: remove line when issues are addressed
 def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     """
     Integration test for the OneDrive destination connector.
@@ -107,10 +101,14 @@ def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     client = connection_config.get_client()
     drive = client.users[user_pname].drive
 
+    # Workaround: File should not have .json in the metadata.filename it comes from embedder
     uploaded_file = (
-        drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
+        drive.root.get_by_path(f"{destination_fullpath}.json")
+        .select(["id", "name"])
+        .get()
+        .execute_query()
     )
 
     # Check if the file exists
     assert uploaded_file is not None
-    assert uploaded_file.name == upload_file.name
+    assert uploaded_file.name == f"{upload_file.name}.json"
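
The updated assertions encode an assumption about the embedder stage: the object uploaded to OneDrive keeps the source filename plus a ".json" suffix. A minimal sketch of that naming rule (the helper and sample filename below are illustrative, not part of the package API):

from pathlib import Path


def expected_destination_name(upload_file: Path) -> str:
    # e.g. "fake-memo.pdf" -> "fake-memo.pdf.json", mirroring what the test now expects
    return f"{upload_file.name}.json"


assert expected_destination_name(Path("fake-memo.pdf")) == "fake-memo.pdf.json"
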
@@ -1 +1 @@
-__version__ = "0.4.6" # pragma: no cover
+__version__ = "0.4.7" # pragma: no cover
@@ -3,11 +3,15 @@ from typing import TYPE_CHECKING
 
 from pydantic import Field
 
-from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.embed.openai import (
+    AsyncOpenAIEmbeddingEncoder,
+    OpenAIEmbeddingConfig,
+    OpenAIEmbeddingEncoder,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
-    from openai import AzureOpenAI
+    from openai import AsyncAzureOpenAI, AzureOpenAI
 
 
 class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
@@ -25,7 +29,22 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
             azure_endpoint=self.azure_endpoint,
         )
 
+    @requires_dependencies(["openai"], extras="openai")
+    def get_async_client(self) -> "AsyncAzureOpenAI":
+        from openai import AsyncAzureOpenAI
+
+        return AsyncAzureOpenAI(
+            api_key=self.api_key.get_secret_value(),
+            api_version=self.api_version,
+            azure_endpoint=self.azure_endpoint,
+        )
+
 
 @dataclass
 class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
     config: AzureOpenAIEmbeddingConfig
+
+
+@dataclass
+class AsyncAzureOpenAIEmbeddingEncoder(AsyncOpenAIEmbeddingEncoder):
+    config: AzureOpenAIEmbeddingConfig
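
For context, a minimal sketch of how the new async client might be exercised directly. The endpoint, key, and deployment name are placeholders, and the config is assumed to need no fields beyond the ones visible in this diff:

import asyncio

from unstructured_ingest.embed.azure_openai import AzureOpenAIEmbeddingConfig


async def embed_sample() -> list[float]:
    config = AzureOpenAIEmbeddingConfig(
        api_key="<api-key>",
        api_version="2024-02-01",
        azure_endpoint="https://<resource>.openai.azure.com",
    )
    # get_async_client() returns an openai.AsyncAzureOpenAI instance
    client = config.get_async_client()
    # Standard openai>=1.x embeddings call, awaited on the async client
    response = await client.embeddings.create(
        model="<embedding-deployment-name>", input=["hello world"]
    )
    return response.data[0].embedding


embedding = asyncio.run(embed_sample())
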
@@ -329,9 +329,9 @@ class Pipeline:
         source_entry = {
             k: v
             for k, v in source_registry.items()
-            if isinstance(indexer_config, v.indexer_config)
-            and isinstance(downloader_config, v.downloader_config)
-            and isinstance(source_connection_config, v.connection_config)
+            if type(indexer_config) is v.indexer_config
+            and type(downloader_config) is v.downloader_config
+            and type(source_connection_config) is v.connection_config
         }
         if len(source_entry) > 1:
             raise ValueError(
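
The switch from isinstance() to exact type comparison matters now that connector configs subclass each other (SharePoint reuses the OneDrive classes later in this diff). A small illustration with stand-in classes, not package code:

class OnedriveIndexerConfig:
    pass


class SharepointIndexerConfig(OnedriveIndexerConfig):
    pass


registry = {"onedrive": OnedriveIndexerConfig, "sharepoint": SharepointIndexerConfig}
config = SharepointIndexerConfig()

# isinstance() also matches the parent entry, so the registry lookup becomes ambiguous
isinstance_matches = [k for k, cls in registry.items() if isinstance(config, cls)]
# exact type comparison selects only the entry registered for this config class
exact_matches = [k for k, cls in registry.items() if type(config) is cls]

assert isinstance_matches == ["onedrive", "sharepoint"]
assert exact_matches == ["sharepoint"]
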
@@ -0,0 +1,23 @@
+{
+    "properties": [
+        {
+            "dataType": [
+                "text"
+            ],
+            "indexFilterable": true,
+            "indexSearchable": true,
+            "name": "record_id",
+            "tokenization": "word"
+        },
+        {
+            "dataType": [
+                "text"
+            ],
+            "indexFilterable": true,
+            "indexSearchable": true,
+            "name": "text",
+            "tokenization": "word"
+        }
+    ],
+    "vectorizer": "none"
+}
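
A hedged sketch of how this packaged asset could seed a Weaviate collection definition: load the JSON and attach a class/collection name before handing it to whichever weaviate-client call your client version uses (the name "Elements" and the v3-style call in the comment are assumptions, not connector code):

import json
from importlib.resources import files

config_path = files("unstructured_ingest.v2.processes.connectors.assets").joinpath(
    "weaviate_collection_config.json"
)
collection_config = json.loads(config_path.read_text())
class_definition = {"class": "Elements", **collection_config}
# e.g. with the v3 client API: weaviate.Client(url).schema.create_class(class_definition)
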
@@ -1,85 +1,43 @@
-import json
-from dataclasses import dataclass, field
-from enum import Enum
-from pathlib import Path
-from time import time
-from typing import TYPE_CHECKING, Any, Generator, Optional
-from urllib.parse import quote
+from __future__ import annotations
 
-from pydantic import BaseModel, Field, Secret, SecretStr
+import asyncio
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, AsyncIterator
 
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from pydantic import Field
+
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
-    AccessConfig,
-    ConnectionConfig,
-    Downloader,
-    DownloaderConfig,
-    DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
-    Indexer,
-    IndexerConfig,
-    SourceIdentifiers,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
-
-from .utils import parse_datetime
+from unstructured_ingest.v2.processes.connectors.onedrive import (
+    OnedriveAccessConfig,
+    OnedriveConnectionConfig,
+    OnedriveDownloader,
+    OnedriveDownloaderConfig,
+    OnedriveIndexer,
+    OnedriveIndexerConfig,
+)
 
 if TYPE_CHECKING:
-    from office365.graph_client import GraphClient
     from office365.onedrive.driveitems.driveItem import DriveItem
-    from office365.onedrive.drives.drive import Drive
-    from office365.onedrive.permissions.permission import Permission
-    from office365.onedrive.sites.site import Site
-    from office365.sharepoint.client_context import ClientContext
-    from office365.sharepoint.files.file import File
-    from office365.sharepoint.folders.folder import Folder
-    from office365.sharepoint.publishing.pages.page import SitePage
 
 CONNECTOR_TYPE = "sharepoint"
 
-MAX_MB_SIZE = 512_000_000
-
-# TODO handle other data types possible from Sharepoint
-# exampled: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint
-
-
-class SharepointContentType(Enum):
-    DOCUMENT = "document"
-    SITEPAGE = "site_page"
-    LIST = "list"
-
 
-class SharepointAccessConfig(AccessConfig):
-    client_cred: str = Field(description="Sharepoint app secret")
+class SharepointAccessConfig(OnedriveAccessConfig):
+    client_cred: str = Field(description="Microsoft App client secret")
 
 
-class SharepointPermissionsConfig(BaseModel):
-    permissions_application_id: Optional[str] = Field(
-        default=None, description="Microsoft Graph API application id"
-    )
-    permissions_tenant: Optional[str] = Field(
-        default=None,
-        description="url to get permissions data within tenant.",
-        examples=["https://contoso.onmicrosoft.com"],
-    )
-    permissions_client_cred: Optional[SecretStr] = Field(
-        default=None, description="Microsoft Graph API application credentials"
-    )
-    authority_url: Optional[SecretStr] = Field(
-        repr=False,
-        default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
-        description="Permissions authority url",
-        examples=["https://login.microsoftonline.com"],
-    )
-
-
-class SharepointConnectionConfig(ConnectionConfig):
-    client_id: str = Field(description="Sharepoint app client ID")
+class SharepointConnectionConfig(OnedriveConnectionConfig):
     site: str = Field(
         description="Sharepoint site url. Process either base url e.g \
            https://[tenant].sharepoint.com or relative sites \
@@ -88,355 +46,75 @@ class SharepointConnectionConfig(ConnectionConfig):
            https://[tenant]-admin.sharepoint.com.\
            This requires the app to be registered at a tenant level"
     )
-    access_config: Secret[SharepointAccessConfig]
-    permissions_config: Optional[SharepointPermissionsConfig] = None
 
-    @requires_dependencies(["office365"], extras="sharepoint")
-    def get_client(self) -> "ClientContext":
-        from office365.runtime.auth.client_credential import ClientCredential
-        from office365.sharepoint.client_context import ClientContext
 
-        try:
-            credentials = ClientCredential(
-                self.client_id, self.access_config.get_secret_value().client_cred
-            )
-            site_client = ClientContext(self.site).with_credentials(credentials)
-        except Exception as e:
-            logger.error(f"Couldn't set Sharepoint client: {e}")
-            raise e
-        return site_client
-
-    @requires_dependencies(["msal"], extras="sharepoint")
-    def get_permissions_token(self):
-        from msal import ConfidentialClientApplication
-
-        try:
-            client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
-            app = ConfidentialClientApplication(
-                authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
-                f"{self.permissions_config.permissions_tenant}",
-                client_id=self.permissions_config.permissions_application_id,
-                client_credential=client_credential,
-            )
-            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
-        except ValueError as exc:
-            logger.error("Couldn't set up credentials for Sharepoint")
-            raise exc
-        if "error" in token:
-            raise SourceConnectionNetworkError(
-                "failed to fetch token, {}: {}".format(token["error"], token["error_description"])
-            )
-        return token
-
-    @requires_dependencies(["office365"], extras="sharepoint")
-    def get_permissions_client(self) -> Optional["GraphClient"]:
-        from office365.graph_client import GraphClient
-
-        if self.permissions_config is None:
-            return None
-
-        client = GraphClient(self.get_permissions_token)
-        return client
-
-
-class SharepointIndexerConfig(IndexerConfig):
-    path: Optional[str] = Field(
-        default=None,
-        description="Path from which to start parsing files. If the connector is to \
-            process all sites within the tenant this filter will be applied to \
-            all sites document libraries.",
-    )
-    recursive: bool = Field(
-        default=False,
-        description="Recursively download files in their respective folders "
-        "otherwise stop at the files in provided folder level.",
-    )
-    omit_files: bool = Field(default=False, description="Don't process files.")
-    omit_pages: bool = Field(default=False, description="Don't process site pages.")
-    omit_lists: bool = Field(default=False, description="Don't process lists.")
+class SharepointIndexerConfig(OnedriveIndexerConfig):
+    pass
 
 
 @dataclass
-class SharepointIndexer(Indexer):
+class SharepointIndexer(OnedriveIndexer):
     connection_config: SharepointConnectionConfig
-    index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
+    index_config: SharepointIndexerConfig
 
-    def precheck(self) -> None:
-        try:
-            site_client = self.connection_config.get_client()
-            site_client.site_pages.pages.get().execute_query()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
-    def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
-        if not recursive:
-            folder.expand(["Files"]).get().execute_query()
-            return folder.files
-
-        folder.expand(["Files", "Folders"]).get().execute_query()
-        files: list["File"] = list(folder.files)
-        folders: list["Folder"] = list(folder.folders)
-        for f in folders:
-            if "/Forms" in f.serverRelativeUrl:
-                continue
-            files.extend(self.list_files(f, recursive))
-        return files
-
-    def get_properties(self, raw_properties: dict) -> dict:
-        raw_properties = {k: v for k, v in raw_properties.items() if v}
-        filtered_properties = {}
-        for k, v in raw_properties.items():
-            try:
-                json.dumps(v)
-                filtered_properties[k] = v
-            except TypeError:
-                pass
-        return filtered_properties
-
-    def list_pages(self, client: "ClientContext") -> list["SitePage"]:
-        pages = client.site_pages.pages.get().execute_query()
-        return pages
-
-    def page_to_file_data(self, site_page: "SitePage") -> FileData:
-        site_page.expand(site_page.properties.keys()).get().execute_query()
-        version = site_page.properties.get("Version", None)
-        unique_id = site_page.properties.get("UniqueId", None)
-        modified_date = site_page.properties.get("Modified", None)
-        url = site_page.properties.get("AbsoluteUrl", None)
-        date_modified_dt = parse_datetime(modified_date) if modified_date else None
-        date_created_at = (
-            parse_datetime(site_page.first_published)
-            if (site_page.first_published and site_page.first_published != "0001-01-01T08:00:00Z")
-            else None
-        )
-        file_path = site_page.get_property("Url", "")
-        server_path = file_path if file_path[0] != "/" else file_path[1:]
-        additional_metadata = self.get_properties(raw_properties=site_page.properties)
-        additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value
-        return FileData(
-            identifier=unique_id,
-            connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(
-                filename=site_page.file_name,
-                fullpath=file_path,
-                rel_path=file_path.replace(self.index_config.path, ""),
-            ),
-            metadata=FileDataSourceMetadata(
-                url=url,
-                version=version,
-                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-                date_created=str(date_created_at.timestamp()) if date_created_at else None,
-                date_processed=str(time()),
-                record_locator={
-                    "server_path": server_path,
-                },
-            ),
-            additional_metadata=additional_metadata,
-        )
-
-    def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData:
-        file.expand(file.properties.keys()).get().execute_query()
-        absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}"
-        date_modified_dt = (
-            parse_datetime(file.time_last_modified) if file.time_last_modified else None
-        )
-
-        date_created_at = parse_datetime(file.time_created) if file.time_created else None
-        additional_metadata = self.get_properties(raw_properties=file.properties)
-        additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value
-        fullpath = str(file.serverRelativeUrl)
-        rel_path = fullpath.replace(self.index_config.path, "")
-        while rel_path[0] == "/":
-            rel_path = rel_path[1:]
-        return FileData(
-            identifier=file.unique_id,
-            connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(
-                filename=file.name,
-                fullpath=fullpath,
-                rel_path=rel_path,
-            ),
-            metadata=FileDataSourceMetadata(
-                url=absolute_url,
-                version=f"{file.major_version}.{file.minor_version}",
-                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-                date_created=str(date_created_at.timestamp()) if date_created_at else None,
-                date_processed=str(time()),
-                record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url},
-            ),
-            additional_metadata=additional_metadata,
-        )
-
-    def get_root(self, client: "ClientContext") -> "Folder":
-        if path := self.index_config.path:
-            return client.web.get_folder_by_server_relative_path(path)
-        default_document_library = client.web.default_document_library()
-        root_folder = default_document_library.root_folder
-        root_folder = root_folder.get().execute_query()
-        self.index_config.path = root_folder.name
-        return root_folder
-
-    def get_site_url(self, client: "ClientContext") -> str:
-        res = client.web.get().execute_query()
-        return res.url
-
-    def get_site(self, permissions_client: "GraphClient", site_url) -> "Site":
-        return permissions_client.sites.get_by_url(url=site_url).execute_query()
-
-    def get_permissions_items(self, site: "Site") -> list["DriveItem"]:
-        # TODO find a way to narrow this search down by name of drive
-        items: list["DriveItem"] = []
-        drives: list["Drive"] = site.drives.get_all().execute_query()
-        for drive in drives:
-            items.extend(drive.root.children.get_all().execute_query())
-        return items
+    @requires_dependencies(["office365"], extras="sharepoint")
+    async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
+        from office365.runtime.client_request_exception import ClientRequestException
 
-    def map_permission(self, permission: "Permission") -> dict:
-        return {
-            "id": permission.id,
-            "roles": list(permission.roles),
-            "share_id": permission.share_id,
-            "has_password": permission.has_password,
-            "link": permission.link.to_json(),
-            "granted_to_identities": permission.granted_to_identities.to_json(),
-            "granted_to": permission.granted_to.to_json(),
-            "granted_to_v2": permission.granted_to_v2.to_json(),
-            "granted_to_identities_v2": permission.granted_to_identities_v2.to_json(),
-            "invitation": permission.invitation.to_json(),
-        }
+        token_resp = await asyncio.to_thread(self.connection_config.get_token)
+        if "error" in token_resp:
+            raise SourceConnectionError(
+                f"[{CONNECTOR_TYPE}]: {token_resp['error']} ({token_resp.get('error_description')})"
+            )
 
-    def enrich_permissions_on_files(self, all_file_data: list[FileData], site_url: str) -> None:
-        logger.debug("Enriching permissions on files")
-        permission_client = self.connection_config.get_permissions_client()
-        if permission_client is None:
-            return
-        site = self.get_site(permissions_client=permission_client, site_url=site_url)
-        existing_items = self.get_permissions_items(site=site)
-        for file_data in all_file_data:
-            etag = file_data.additional_metadata.get("ETag")
-            if not etag:
-                continue
-            matching_items = list(filter(lambda x: x.etag == etag, existing_items))
-            if not matching_items:
-                continue
-            if len(matching_items) > 1:
-                logger.warning(
-                    "Found multiple drive items with etag matching {}, skipping: {}".format(
-                        etag, ", ".join([i.name for i in matching_items])
-                    )
-                )
-                continue
-            matching_item = matching_items[0]
-            permissions: list["Permission"] = matching_item.permissions.get_all().execute_query()
-            permissions_data = [
-                self.map_permission(permission=permission) for permission in permissions
-            ]
-            file_data.metadata.permissions_data = permissions_data
+        client = await asyncio.to_thread(self.connection_config.get_client)
+        try:
+            site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
+            site_drive_item = site.drive.get().execute_query().root
+        except ClientRequestException:
+            logger.info("Site not found")
 
-    @property
-    def process_permissions(self) -> bool:
-        return (
-            self.connection_config.permissions_config is not None
-            and self.connection_config.permissions_config.permissions_tenant
-            and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
-            and self.connection_config.permissions_config.permissions_application_id
+        drive_items = await self.list_objects(
+            folder=site_drive_item, recursive=self.index_config.recursive
         )
-
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        client = self.connection_config.get_client()
-        root_folder = self.get_root(client=client)
-        logger.debug(f"processing content from path: {self.index_config.path}")
-        if not self.index_config.omit_files:
-            files = self.list_files(root_folder, recursive=self.index_config.recursive)
-            file_data = [self.file_to_file_data(file=file, client=client) for file in files]
-            if self.process_permissions:
-                self.enrich_permissions_on_files(
-                    all_file_data=file_data, site_url=self.get_site_url(client=client)
-                )
-            for file in file_data:
-                yield file
-        if not self.index_config.omit_pages:
-            pages = self.list_pages(client=client)
-            for page in pages:
-                file_data = self.page_to_file_data(site_page=page)
-                file_data.metadata.record_locator["site_url"] = client.base_url
-                yield file_data
+        for drive_item in drive_items:
+            file_data = await self.drive_item_to_file_data(drive_item=drive_item)
+            yield file_data
 
 
-class SharepointDownloaderConfig(DownloaderConfig):
+class SharepointDownloaderConfig(OnedriveDownloaderConfig):
     pass
 
 
 @dataclass
-class SharepointDownloader(Downloader):
+class SharepointDownloader(OnedriveDownloader):
     connection_config: SharepointConnectionConfig
     download_config: SharepointDownloaderConfig
-    connector_type: str = CONNECTOR_TYPE
-
-    def get_download_path(self, file_data: FileData) -> Path:
-        download_path = super().get_download_path(file_data=file_data)
 
-        content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        if content_type == SharepointContentType.SITEPAGE.value:
-            # Update output extension to html if site page
-            download_path = download_path.with_suffix(".html")
-        return download_path
-
-    def get_document(self, file_data: FileData) -> DownloadResponse:
-        client: "ClientContext" = self.connection_config.get_client()
-        file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier)
-        download_path = self.get_download_path(file_data=file_data)
-        download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.debug(
-            f"writing document content {file_data.source_identifiers.fullpath} to {download_path}"
-        )
-        with download_path.open("wb") as f:
-            file.download(f).execute_query()
-        return self.generate_download_response(file_data=file_data, download_path=download_path)
+    @SourceConnectionNetworkError.wrap
+    @requires_dependencies(["office365"], extras="onedrive")
+    def _fetch_file(self, file_data: FileData) -> DriveItem:
+        from office365.runtime.client_request_exception import ClientRequestException
 
-    def get_site_page(self, file_data: FileData) -> DownloadResponse:
-        # TODO fetch comments for site page as well
-        from lxml import etree, html
-
-        canvas_content_raw = file_data.additional_metadata.get("CanvasContent1")
-        layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent")
-        html_content = []
-        if layout_web_parts_content_raw:
-            layout_web_parts_content = json.loads(layout_web_parts_content_raw)
-            for web_part in layout_web_parts_content:
-                properties = web_part.get("properties", {})
-                if title := properties.get("title"):
-                    html_content.append(f"<title>{title}</title>")
-        if canvas_content_raw:
-            canvas_content = json.loads(canvas_content_raw)
-            for content in canvas_content:
-                if inner_html := content.get("innerHTML"):
-                    html_content.append(inner_html)
-        htmls = "".join(html_content)
-        content = f"<div>{htmls}</div>"
-        document = html.fromstring(content)
-        download_path = self.get_download_path(file_data=file_data)
-        download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.debug(
-            f"writing site page content {file_data.source_identifiers.filename} to {download_path}"
-        )
-        with download_path.open("w") as f:
-            f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
-        return self.generate_download_response(file_data=file_data, download_path=download_path)
-
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        if not content_type:
+        if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
             raise ValueError(
-                f"Missing sharepoint_content_type metadata: {file_data.additional_metadata}"
+                f"file data doesn't have enough information to get "
+                f"file content: {file_data.model_dump()}"
             )
-        if content_type == SharepointContentType.DOCUMENT.value:
-            return self.get_document(file_data=file_data)
-        elif content_type == SharepointContentType.SITEPAGE.value:
-            return self.get_site_page(file_data=file_data)
-        else:
-            raise ValueError(f"content type not recognized: {content_type}")
+
+        server_relative_path = file_data.source_identifiers.fullpath
+        client = self.connection_config.get_client()
+
+        try:
+            site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
+            site_drive_item = site.drive.get().execute_query().root
+        except ClientRequestException:
+            logger.info("Site not found")
+        file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
+
+        if not file:
+            raise FileNotFoundError(f"file not found: {server_relative_path}")
+        return file
 
 
 sharepoint_source_entry = SourceRegistryEntry(
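
Since the rewritten SharepointIndexer.run_async() is an async generator yielding FileData, it can be drained with an async for loop. A minimal consumption sketch; the connection config construction is elided because its field names (other than site and client_cred) are inherited from the OneDrive connector and not shown in this diff:

import asyncio

from unstructured_ingest.v2.processes.connectors.sharepoint import (
    SharepointIndexer,
    SharepointIndexerConfig,
)


async def list_site_files(connection_config) -> list:
    indexer = SharepointIndexer(
        connection_config=connection_config,
        index_config=SharepointIndexerConfig(recursive=True),
    )
    # run_async() yields FileData objects for the drive items found under the site drive root
    return [file_data async for file_data in indexer.run_async()]


# files = asyncio.run(list_site_files(my_sharepoint_connection_config))
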
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.4.6
+Version: 0.4.7
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: pandas
 Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
 Requires-Dist: python-dateutil
-Requires-Dist: click
-Requires-Dist: tqdm
-Requires-Dist: pandas
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: tqdm
+Requires-Dist: click
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -39,21 +39,21 @@ Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: atlassian-python-api; extra == "confluence"
 Requires-Dist: requests; extra == "confluence"
+Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: doc
@@ -92,12 +92,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
+Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -122,20 +122,20 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: neo4j
-Requires-Dist: neo4j; extra == "neo4j"
-Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
+Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
@@ -169,8 +169,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
@@ -191,13 +191,13 @@ Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vastdb
-Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: vectara
-Requires-Dist: requests; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: requests; extra == "vectara"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"
 Provides-Extra: wikipedia
@@ -15,7 +15,7 @@ test/integration/connectors/test_milvus.py,sha256=7mI6zznN0PTxDL9DLogH1k3dxx6R8D
 test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8uNWTF2wyLGwlhco,12446
 test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
 test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
-test/integration/connectors/test_onedrive.py,sha256=TcMaa5BIp8J6engS4UZ2t19WQP0NNz2rkpBB47m7A3Y,3835
+test/integration/connectors/test_onedrive.py,sha256=rjgN2LhaW1htEMBJPxmlP_kcRB7p_oOeZcogFlHyJH4,3721
 test/integration/connectors/test_pinecone.py,sha256=acKEu1vnAk0Ht3FhCnGtOEKaj_YlgCzZB7wRU17ehQ0,12407
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
 test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
@@ -102,7 +102,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=0ZfnDBlBmcWgua3sGv2Fwo28JBX-eiHGLg4rl98g_F0,42
+unstructured_ingest/__version__.py,sha256=i2QrUEuUnVPQuTv5hg_JWbhbwm5k6KU4hPIFq0SIgdc,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -270,7 +270,7 @@ unstructured_ingest/connector/notion/types/database_properties/unique_id.py,sha2
 unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ2tVUm9UlKVtDA0NQiFIRJ5PHYW9wOaWt2vFfSVCg,862
 unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
 unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-unstructured_ingest/embed/azure_openai.py,sha256=4YBOIxv66wVZ5EqNNC4uCDPNJ3VrsLPe5wwagT6zqe0,1001
+unstructured_ingest/embed/azure_openai.py,sha256=u9reyZzY6BtsT5U_TdIfS6vH_42lvohVBwKMPQAqvkI,1528
 unstructured_ingest/embed/bedrock.py,sha256=50G8PBEdW3ILwyWXAWl4w-gUA9I0AR7LuFq6NLz-sWI,7284
 unstructured_ingest/embed/huggingface.py,sha256=Avcc16st9Cp2xGScG6TeNEEd3T8YjjnESNN4OdIlnh0,2119
 unstructured_ingest/embed/interfaces.py,sha256=7jsQ3rLOXy1hq__muf-EPcLnv17XzNQaD05AyGbZeNo,3739
@@ -399,7 +399,7 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=rrZLTjmTcrDL-amQIKzIP6j2fW-
 unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
 unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
-unstructured_ingest/v2/pipeline/pipeline.py,sha256=-1TlqG33x_GGjGMk4Y8Psx1z6Prbuj11MMAR2WAuhBc,16520
+unstructured_ingest/v2/pipeline/pipeline.py,sha256=4IwCWMlBrMpZI6V82q5nzrbyQNDVM62AQsWt6MUBWa8,16508
 unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
 unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
@@ -438,11 +438,12 @@ unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_Spsw
 unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
 unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
-unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
+unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=SdcbOEUzgi1sUZJA6doZDm-a8d4F3Qtud-OVbDKW7Ng,4456
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
 unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
 unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
 unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
 unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
 unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=Yj4fIxgGo9Qh1x_6-INdbrPGHCuZElu-QKNfSVtW7F4,7694
 unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=TA2e_1SIr4VaEI62873eyReCNfgmQ51_2Pko2I04pPM,2747
@@ -561,9 +562,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
 unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
-unstructured_ingest-0.4.6.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.4.6.dist-info/METADATA,sha256=-Z6UDd_I1lUsEbYTmeBlNb4D4-e3y67LM4n75igK1tY,8051
-unstructured_ingest-0.4.6.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.4.6.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.4.6.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.4.6.dist-info/RECORD,,
+unstructured_ingest-0.4.7.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.4.7.dist-info/METADATA,sha256=yGcahQ8fZmoU_c1h02b76tRn5w0uj_931AAQKlFrqxs,8051
+unstructured_ingest-0.4.7.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.4.7.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.4.7.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.4.7.dist-info/RECORD,,