unstructured-ingest 0.4.6__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest has been flagged as potentially problematic.

@@ -18,9 +18,6 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
 
 
 @pytest.fixture
-@pytest.mark.xfail(
-    reason="Issues with test setup on the provider side."
-) # TODO: remove line when issues are addressed
 def onedrive_test_folder() -> str:
     """
     Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -67,9 +64,6 @@ def get_connection_config():
 
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
 @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
-@pytest.mark.xfail(
-    reason="Issues with test setup on the provider side."
-) # TODO: remove line when issues are addressed
 def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     """
     Integration test for the OneDrive destination connector.
@@ -107,10 +101,14 @@ def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     client = connection_config.get_client()
     drive = client.users[user_pname].drive
 
+    # Workaround: File should not have .json in the metadata.filename it comes from embedder
     uploaded_file = (
-        drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
+        drive.root.get_by_path(f"{destination_fullpath}.json")
+        .select(["id", "name"])
+        .get()
+        .execute_query()
     )
 
     # Check if the file exists
     assert uploaded_file is not None
-    assert uploaded_file.name == upload_file.name
+    assert uploaded_file.name == f"{upload_file.name}.json"
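
The updated assertions encode an assumption about the embedder stage: the object uploaded to OneDrive keeps the source filename plus a ".json" suffix. A minimal sketch of that naming rule (the helper and sample filename below are illustrative, not part of the package API):

from pathlib import Path


def expected_destination_name(upload_file: Path) -> str:
    # e.g. "fake-memo.pdf" -> "fake-memo.pdf.json", mirroring what the test now expects
    return f"{upload_file.name}.json"


assert expected_destination_name(Path("fake-memo.pdf")) == "fake-memo.pdf.json"
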
@@ -1 +1 @@
-__version__ = "0.4.6" # pragma: no cover
+__version__ = "0.4.7" # pragma: no cover
@@ -3,11 +3,15 @@ from typing import TYPE_CHECKING
 
 from pydantic import Field
 
-from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.embed.openai import (
+    AsyncOpenAIEmbeddingEncoder,
+    OpenAIEmbeddingConfig,
+    OpenAIEmbeddingEncoder,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
-    from openai import AzureOpenAI
+    from openai import AsyncAzureOpenAI, AzureOpenAI
 
 
 class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
@@ -25,7 +29,22 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
             azure_endpoint=self.azure_endpoint,
         )
 
+    @requires_dependencies(["openai"], extras="openai")
+    def get_async_client(self) -> "AsyncAzureOpenAI":
+        from openai import AsyncAzureOpenAI
+
+        return AsyncAzureOpenAI(
+            api_key=self.api_key.get_secret_value(),
+            api_version=self.api_version,
+            azure_endpoint=self.azure_endpoint,
+        )
+
 
 @dataclass
 class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
     config: AzureOpenAIEmbeddingConfig
+
+
+@dataclass
+class AsyncAzureOpenAIEmbeddingEncoder(AsyncOpenAIEmbeddingEncoder):
+    config: AzureOpenAIEmbeddingConfig
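
For context, a minimal sketch of how the new async client might be exercised directly. The endpoint, key, and deployment name are placeholders, and the config is assumed to need no fields beyond the ones visible in this diff:

import asyncio

from unstructured_ingest.embed.azure_openai import AzureOpenAIEmbeddingConfig


async def embed_sample() -> list[float]:
    config = AzureOpenAIEmbeddingConfig(
        api_key="<api-key>",
        api_version="2024-02-01",
        azure_endpoint="https://<resource>.openai.azure.com",
    )
    # get_async_client() returns an openai.AsyncAzureOpenAI instance
    client = config.get_async_client()
    # Standard openai>=1.x embeddings call, awaited on the async client
    response = await client.embeddings.create(
        model="<embedding-deployment-name>", input=["hello world"]
    )
    return response.data[0].embedding


embedding = asyncio.run(embed_sample())
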
@@ -329,9 +329,9 @@ class Pipeline:
         source_entry = {
             k: v
             for k, v in source_registry.items()
-            if isinstance(indexer_config, v.indexer_config)
-            and isinstance(downloader_config, v.downloader_config)
-            and isinstance(source_connection_config, v.connection_config)
+            if type(indexer_config) is v.indexer_config
+            and type(downloader_config) is v.downloader_config
+            and type(source_connection_config) is v.connection_config
         }
         if len(source_entry) > 1:
             raise ValueError(
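
The switch from isinstance() to exact type comparison matters now that connector configs subclass each other (SharePoint reuses the OneDrive classes later in this diff). A small illustration with stand-in classes, not package code:

class OnedriveIndexerConfig:
    pass


class SharepointIndexerConfig(OnedriveIndexerConfig):
    pass


registry = {"onedrive": OnedriveIndexerConfig, "sharepoint": SharepointIndexerConfig}
config = SharepointIndexerConfig()

# isinstance() also matches the parent entry, so the registry lookup becomes ambiguous
isinstance_matches = [k for k, cls in registry.items() if isinstance(config, cls)]
# exact type comparison selects only the entry registered for this config class
exact_matches = [k for k, cls in registry.items() if type(config) is cls]

assert isinstance_matches == ["onedrive", "sharepoint"]
assert exact_matches == ["sharepoint"]
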
@@ -0,0 +1,23 @@
+{
+    "properties": [
+        {
+            "dataType": [
+                "text"
+            ],
+            "indexFilterable": true,
+            "indexSearchable": true,
+            "name": "record_id",
+            "tokenization": "word"
+        },
+        {
+            "dataType": [
+                "text"
+            ],
+            "indexFilterable": true,
+            "indexSearchable": true,
+            "name": "text",
+            "tokenization": "word"
+        }
+    ],
+    "vectorizer": "none"
+}
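
A hedged sketch of how this packaged asset could seed a Weaviate collection definition: load the JSON and attach a class/collection name before handing it to whichever weaviate-client call your client version uses (the name "Elements" and the v3-style call in the comment are assumptions, not connector code):

import json
from importlib.resources import files

config_path = files("unstructured_ingest.v2.processes.connectors.assets").joinpath(
    "weaviate_collection_config.json"
)
collection_config = json.loads(config_path.read_text())
class_definition = {"class": "Elements", **collection_config}
# e.g. with the v3 client API: weaviate.Client(url).schema.create_class(class_definition)
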
@@ -1,85 +1,43 @@
-import json
-from dataclasses import dataclass, field
-from enum import Enum
-from pathlib import Path
-from time import time
-from typing import TYPE_CHECKING, Any, Generator, Optional
-from urllib.parse import quote
+from __future__ import annotations
 
-from pydantic import BaseModel, Field, Secret, SecretStr
+import asyncio
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, AsyncIterator
 
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from pydantic import Field
+
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
-    AccessConfig,
-    ConnectionConfig,
-    Downloader,
-    DownloaderConfig,
-    DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
-    Indexer,
-    IndexerConfig,
-    SourceIdentifiers,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
-
-from .utils import parse_datetime
+from unstructured_ingest.v2.processes.connectors.onedrive import (
+    OnedriveAccessConfig,
+    OnedriveConnectionConfig,
+    OnedriveDownloader,
+    OnedriveDownloaderConfig,
+    OnedriveIndexer,
+    OnedriveIndexerConfig,
+)
 
 if TYPE_CHECKING:
-    from office365.graph_client import GraphClient
     from office365.onedrive.driveitems.driveItem import DriveItem
-    from office365.onedrive.drives.drive import Drive
-    from office365.onedrive.permissions.permission import Permission
-    from office365.onedrive.sites.site import Site
-    from office365.sharepoint.client_context import ClientContext
-    from office365.sharepoint.files.file import File
-    from office365.sharepoint.folders.folder import Folder
-    from office365.sharepoint.publishing.pages.page import SitePage
 
 CONNECTOR_TYPE = "sharepoint"
 
-MAX_MB_SIZE = 512_000_000
-
-# TODO handle other data types possible from Sharepoint
-# exampled: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint
-
-
-class SharepointContentType(Enum):
-    DOCUMENT = "document"
-    SITEPAGE = "site_page"
-    LIST = "list"
-
 
-class SharepointAccessConfig(AccessConfig):
-    client_cred: str = Field(description="Sharepoint app secret")
+class SharepointAccessConfig(OnedriveAccessConfig):
+    client_cred: str = Field(description="Microsoft App client secret")
 
 
-class SharepointPermissionsConfig(BaseModel):
-    permissions_application_id: Optional[str] = Field(
-        default=None, description="Microsoft Graph API application id"
-    )
-    permissions_tenant: Optional[str] = Field(
-        default=None,
-        description="url to get permissions data within tenant.",
-        examples=["https://contoso.onmicrosoft.com"],
-    )
-    permissions_client_cred: Optional[SecretStr] = Field(
-        default=None, description="Microsoft Graph API application credentials"
-    )
-    authority_url: Optional[SecretStr] = Field(
-        repr=False,
-        default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
-        description="Permissions authority url",
-        examples=["https://login.microsoftonline.com"],
-    )
-
-
-class SharepointConnectionConfig(ConnectionConfig):
-    client_id: str = Field(description="Sharepoint app client ID")
+class SharepointConnectionConfig(OnedriveConnectionConfig):
     site: str = Field(
         description="Sharepoint site url. Process either base url e.g \
            https://[tenant].sharepoint.com or relative sites \
@@ -88,355 +46,75 @@ class SharepointConnectionConfig(ConnectionConfig):
            https://[tenant]-admin.sharepoint.com.\
            This requires the app to be registered at a tenant level"
     )
-    access_config: Secret[SharepointAccessConfig]
-    permissions_config: Optional[SharepointPermissionsConfig] = None
 
-    @requires_dependencies(["office365"], extras="sharepoint")
-    def get_client(self) -> "ClientContext":
-        from office365.runtime.auth.client_credential import ClientCredential
-        from office365.sharepoint.client_context import ClientContext
 
-        try:
-            credentials = ClientCredential(
-                self.client_id, self.access_config.get_secret_value().client_cred
-            )
-            site_client = ClientContext(self.site).with_credentials(credentials)
-        except Exception as e:
-            logger.error(f"Couldn't set Sharepoint client: {e}")
-            raise e
-        return site_client
-
-    @requires_dependencies(["msal"], extras="sharepoint")
-    def get_permissions_token(self):
-        from msal import ConfidentialClientApplication
-
-        try:
-            client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
-            app = ConfidentialClientApplication(
-                authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
-                f"{self.permissions_config.permissions_tenant}",
-                client_id=self.permissions_config.permissions_application_id,
-                client_credential=client_credential,
-            )
-            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
-        except ValueError as exc:
-            logger.error("Couldn't set up credentials for Sharepoint")
-            raise exc
-        if "error" in token:
-            raise SourceConnectionNetworkError(
-                "failed to fetch token, {}: {}".format(token["error"], token["error_description"])
-            )
-        return token
-
-    @requires_dependencies(["office365"], extras="sharepoint")
-    def get_permissions_client(self) -> Optional["GraphClient"]:
-        from office365.graph_client import GraphClient
-
-        if self.permissions_config is None:
-            return None
-
-        client = GraphClient(self.get_permissions_token)
-        return client
-
-
-class SharepointIndexerConfig(IndexerConfig):
-    path: Optional[str] = Field(
-        default=None,
-        description="Path from which to start parsing files. If the connector is to \
-            process all sites within the tenant this filter will be applied to \
-            all sites document libraries.",
-    )
-    recursive: bool = Field(
-        default=False,
-        description="Recursively download files in their respective folders "
-        "otherwise stop at the files in provided folder level.",
-    )
-    omit_files: bool = Field(default=False, description="Don't process files.")
-    omit_pages: bool = Field(default=False, description="Don't process site pages.")
-    omit_lists: bool = Field(default=False, description="Don't process lists.")
+class SharepointIndexerConfig(OnedriveIndexerConfig):
+    pass
 
 
 @dataclass
-class SharepointIndexer(Indexer):
+class SharepointIndexer(OnedriveIndexer):
     connection_config: SharepointConnectionConfig
-    index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
+    index_config: SharepointIndexerConfig
 
-    def precheck(self) -> None:
-        try:
-            site_client = self.connection_config.get_client()
-            site_client.site_pages.pages.get().execute_query()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
-    def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
-        if not recursive:
-            folder.expand(["Files"]).get().execute_query()
-            return folder.files
-
-        folder.expand(["Files", "Folders"]).get().execute_query()
-        files: list["File"] = list(folder.files)
-        folders: list["Folder"] = list(folder.folders)
-        for f in folders:
-            if "/Forms" in f.serverRelativeUrl:
-                continue
-            files.extend(self.list_files(f, recursive))
-        return files
-
-    def get_properties(self, raw_properties: dict) -> dict:
-        raw_properties = {k: v for k, v in raw_properties.items() if v}
-        filtered_properties = {}
-        for k, v in raw_properties.items():
-            try:
-                json.dumps(v)
-                filtered_properties[k] = v
-            except TypeError:
-                pass
-        return filtered_properties
-
-    def list_pages(self, client: "ClientContext") -> list["SitePage"]:
-        pages = client.site_pages.pages.get().execute_query()
-        return pages
-
-    def page_to_file_data(self, site_page: "SitePage") -> FileData:
-        site_page.expand(site_page.properties.keys()).get().execute_query()
-        version = site_page.properties.get("Version", None)
-        unique_id = site_page.properties.get("UniqueId", None)
-        modified_date = site_page.properties.get("Modified", None)
-        url = site_page.properties.get("AbsoluteUrl", None)
-        date_modified_dt = parse_datetime(modified_date) if modified_date else None
-        date_created_at = (
-            parse_datetime(site_page.first_published)
-            if (site_page.first_published and site_page.first_published != "0001-01-01T08:00:00Z")
-            else None
-        )
-        file_path = site_page.get_property("Url", "")
-        server_path = file_path if file_path[0] != "/" else file_path[1:]
-        additional_metadata = self.get_properties(raw_properties=site_page.properties)
-        additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value
-        return FileData(
-            identifier=unique_id,
-            connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(
-                filename=site_page.file_name,
-                fullpath=file_path,
-                rel_path=file_path.replace(self.index_config.path, ""),
-            ),
-            metadata=FileDataSourceMetadata(
-                url=url,
-                version=version,
-                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-                date_created=str(date_created_at.timestamp()) if date_created_at else None,
-                date_processed=str(time()),
-                record_locator={
-                    "server_path": server_path,
-                },
-            ),
-            additional_metadata=additional_metadata,
-        )
-
-    def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData:
-        file.expand(file.properties.keys()).get().execute_query()
-        absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}"
-        date_modified_dt = (
-            parse_datetime(file.time_last_modified) if file.time_last_modified else None
-        )
-
-        date_created_at = parse_datetime(file.time_created) if file.time_created else None
-        additional_metadata = self.get_properties(raw_properties=file.properties)
-        additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value
-        fullpath = str(file.serverRelativeUrl)
-        rel_path = fullpath.replace(self.index_config.path, "")
-        while rel_path[0] == "/":
-            rel_path = rel_path[1:]
-        return FileData(
-            identifier=file.unique_id,
-            connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(
-                filename=file.name,
-                fullpath=fullpath,
-                rel_path=rel_path,
-            ),
-            metadata=FileDataSourceMetadata(
-                url=absolute_url,
-                version=f"{file.major_version}.{file.minor_version}",
-                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-                date_created=str(date_created_at.timestamp()) if date_created_at else None,
-                date_processed=str(time()),
-                record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url},
-            ),
-            additional_metadata=additional_metadata,
-        )
-
-    def get_root(self, client: "ClientContext") -> "Folder":
-        if path := self.index_config.path:
-            return client.web.get_folder_by_server_relative_path(path)
-        default_document_library = client.web.default_document_library()
-        root_folder = default_document_library.root_folder
-        root_folder = root_folder.get().execute_query()
-        self.index_config.path = root_folder.name
-        return root_folder
-
-    def get_site_url(self, client: "ClientContext") -> str:
-        res = client.web.get().execute_query()
-        return res.url
-
-    def get_site(self, permissions_client: "GraphClient", site_url) -> "Site":
-        return permissions_client.sites.get_by_url(url=site_url).execute_query()
-
-    def get_permissions_items(self, site: "Site") -> list["DriveItem"]:
-        # TODO find a way to narrow this search down by name of drive
-        items: list["DriveItem"] = []
-        drives: list["Drive"] = site.drives.get_all().execute_query()
-        for drive in drives:
-            items.extend(drive.root.children.get_all().execute_query())
-        return items
+    @requires_dependencies(["office365"], extras="sharepoint")
+    async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
+        from office365.runtime.client_request_exception import ClientRequestException
 
-    def map_permission(self, permission: "Permission") -> dict:
-        return {
-            "id": permission.id,
-            "roles": list(permission.roles),
-            "share_id": permission.share_id,
-            "has_password": permission.has_password,
-            "link": permission.link.to_json(),
-            "granted_to_identities": permission.granted_to_identities.to_json(),
-            "granted_to": permission.granted_to.to_json(),
-            "granted_to_v2": permission.granted_to_v2.to_json(),
-            "granted_to_identities_v2": permission.granted_to_identities_v2.to_json(),
-            "invitation": permission.invitation.to_json(),
-        }
+        token_resp = await asyncio.to_thread(self.connection_config.get_token)
+        if "error" in token_resp:
+            raise SourceConnectionError(
+                f"[{CONNECTOR_TYPE}]: {token_resp['error']} ({token_resp.get('error_description')})"
+            )
 
-    def enrich_permissions_on_files(self, all_file_data: list[FileData], site_url: str) -> None:
-        logger.debug("Enriching permissions on files")
-        permission_client = self.connection_config.get_permissions_client()
-        if permission_client is None:
-            return
-        site = self.get_site(permissions_client=permission_client, site_url=site_url)
-        existing_items = self.get_permissions_items(site=site)
-        for file_data in all_file_data:
-            etag = file_data.additional_metadata.get("ETag")
-            if not etag:
-                continue
-            matching_items = list(filter(lambda x: x.etag == etag, existing_items))
-            if not matching_items:
-                continue
-            if len(matching_items) > 1:
-                logger.warning(
-                    "Found multiple drive items with etag matching {}, skipping: {}".format(
-                        etag, ", ".join([i.name for i in matching_items])
-                    )
-                )
-                continue
-            matching_item = matching_items[0]
-            permissions: list["Permission"] = matching_item.permissions.get_all().execute_query()
-            permissions_data = [
-                self.map_permission(permission=permission) for permission in permissions
-            ]
-            file_data.metadata.permissions_data = permissions_data
+        client = await asyncio.to_thread(self.connection_config.get_client)
+        try:
+            site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
+            site_drive_item = site.drive.get().execute_query().root
+        except ClientRequestException:
+            logger.info("Site not found")
 
-    @property
-    def process_permissions(self) -> bool:
-        return (
-            self.connection_config.permissions_config is not None
-            and self.connection_config.permissions_config.permissions_tenant
-            and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
-            and self.connection_config.permissions_config.permissions_application_id
+        drive_items = await self.list_objects(
+            folder=site_drive_item, recursive=self.index_config.recursive
         )
-
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        client = self.connection_config.get_client()
-        root_folder = self.get_root(client=client)
-        logger.debug(f"processing content from path: {self.index_config.path}")
-        if not self.index_config.omit_files:
-            files = self.list_files(root_folder, recursive=self.index_config.recursive)
-            file_data = [self.file_to_file_data(file=file, client=client) for file in files]
-            if self.process_permissions:
-                self.enrich_permissions_on_files(
-                    all_file_data=file_data, site_url=self.get_site_url(client=client)
-                )
-            for file in file_data:
-                yield file
-        if not self.index_config.omit_pages:
-            pages = self.list_pages(client=client)
-            for page in pages:
-                file_data = self.page_to_file_data(site_page=page)
-                file_data.metadata.record_locator["site_url"] = client.base_url
-                yield file_data
+        for drive_item in drive_items:
+            file_data = await self.drive_item_to_file_data(drive_item=drive_item)
+            yield file_data
 
 
-class SharepointDownloaderConfig(DownloaderConfig):
+class SharepointDownloaderConfig(OnedriveDownloaderConfig):
     pass
 
 
 @dataclass
-class SharepointDownloader(Downloader):
+class SharepointDownloader(OnedriveDownloader):
     connection_config: SharepointConnectionConfig
     download_config: SharepointDownloaderConfig
-    connector_type: str = CONNECTOR_TYPE
-
-    def get_download_path(self, file_data: FileData) -> Path:
-        download_path = super().get_download_path(file_data=file_data)
 
-        content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        if content_type == SharepointContentType.SITEPAGE.value:
-            # Update output extension to html if site page
-            download_path = download_path.with_suffix(".html")
-        return download_path
-
-    def get_document(self, file_data: FileData) -> DownloadResponse:
-        client: "ClientContext" = self.connection_config.get_client()
-        file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier)
-        download_path = self.get_download_path(file_data=file_data)
-        download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.debug(
-            f"writing document content {file_data.source_identifiers.fullpath} to {download_path}"
-        )
-        with download_path.open("wb") as f:
-            file.download(f).execute_query()
-        return self.generate_download_response(file_data=file_data, download_path=download_path)
+    @SourceConnectionNetworkError.wrap
+    @requires_dependencies(["office365"], extras="onedrive")
+    def _fetch_file(self, file_data: FileData) -> DriveItem:
+        from office365.runtime.client_request_exception import ClientRequestException
 
-    def get_site_page(self, file_data: FileData) -> DownloadResponse:
-        # TODO fetch comments for site page as well
-        from lxml import etree, html
-
-        canvas_content_raw = file_data.additional_metadata.get("CanvasContent1")
-        layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent")
-        html_content = []
-        if layout_web_parts_content_raw:
-            layout_web_parts_content = json.loads(layout_web_parts_content_raw)
-            for web_part in layout_web_parts_content:
-                properties = web_part.get("properties", {})
-                if title := properties.get("title"):
-                    html_content.append(f"<title>{title}</title>")
-        if canvas_content_raw:
-            canvas_content = json.loads(canvas_content_raw)
-            for content in canvas_content:
-                if inner_html := content.get("innerHTML"):
-                    html_content.append(inner_html)
-        htmls = "".join(html_content)
-        content = f"<div>{htmls}</div>"
-        document = html.fromstring(content)
-        download_path = self.get_download_path(file_data=file_data)
-        download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.debug(
-            f"writing site page content {file_data.source_identifiers.filename} to {download_path}"
-        )
-        with download_path.open("w") as f:
-            f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
-        return self.generate_download_response(file_data=file_data, download_path=download_path)
-
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        if not content_type:
+        if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
             raise ValueError(
-                f"Missing sharepoint_content_type metadata: {file_data.additional_metadata}"
+                f"file data doesn't have enough information to get "
+                f"file content: {file_data.model_dump()}"
             )
-        if content_type == SharepointContentType.DOCUMENT.value:
-            return self.get_document(file_data=file_data)
-        elif content_type == SharepointContentType.SITEPAGE.value:
-            return self.get_site_page(file_data=file_data)
-        else:
-            raise ValueError(f"content type not recognized: {content_type}")
+
+        server_relative_path = file_data.source_identifiers.fullpath
+        client = self.connection_config.get_client()
+
+        try:
+            site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
+            site_drive_item = site.drive.get().execute_query().root
+        except ClientRequestException:
+            logger.info("Site not found")
+        file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
+
+        if not file:
+            raise FileNotFoundError(f"file not found: {server_relative_path}")
+        return file
 
 
 sharepoint_source_entry = SourceRegistryEntry(
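
Since the rewritten SharepointIndexer.run_async() is an async generator yielding FileData, it can be drained with an async for loop. A minimal consumption sketch; the connection config construction is elided because its field names (other than site and client_cred) are inherited from the OneDrive connector and not shown in this diff:

import asyncio

from unstructured_ingest.v2.processes.connectors.sharepoint import (
    SharepointIndexer,
    SharepointIndexerConfig,
)


async def list_site_files(connection_config) -> list:
    indexer = SharepointIndexer(
        connection_config=connection_config,
        index_config=SharepointIndexerConfig(recursive=True),
    )
    # run_async() yields FileData objects for the drive items found under the site drive root
    return [file_data async for file_data in indexer.run_async()]


# files = asyncio.run(list_site_files(my_sharepoint_connection_config))
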
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.4.6
+Version: 0.4.7
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: pandas
 Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
 Requires-Dist: python-dateutil
-Requires-Dist: click
-Requires-Dist: tqdm
-Requires-Dist: pandas
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: tqdm
+Requires-Dist: click
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -39,21 +39,21 @@ Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: atlassian-python-api; extra == "confluence"
 Requires-Dist: requests; extra == "confluence"
+Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: doc
@@ -92,12 +92,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
+Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -122,20 +122,20 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: neo4j
-Requires-Dist: neo4j; extra == "neo4j"
-Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
+Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
@@ -169,8 +169,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
@@ -191,13 +191,13 @@ Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vastdb
-Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: vectara
-Requires-Dist: requests; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: requests; extra == "vectara"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"
 Provides-Extra: wikipedia
@@ -15,7 +15,7 @@ test/integration/connectors/test_milvus.py,sha256=7mI6zznN0PTxDL9DLogH1k3dxx6R8D
 test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8uNWTF2wyLGwlhco,12446
 test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
 test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
-test/integration/connectors/test_onedrive.py,sha256=TcMaa5BIp8J6engS4UZ2t19WQP0NNz2rkpBB47m7A3Y,3835
+test/integration/connectors/test_onedrive.py,sha256=rjgN2LhaW1htEMBJPxmlP_kcRB7p_oOeZcogFlHyJH4,3721
 test/integration/connectors/test_pinecone.py,sha256=acKEu1vnAk0Ht3FhCnGtOEKaj_YlgCzZB7wRU17ehQ0,12407
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
 test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
@@ -102,7 +102,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=0ZfnDBlBmcWgua3sGv2Fwo28JBX-eiHGLg4rl98g_F0,42
+unstructured_ingest/__version__.py,sha256=i2QrUEuUnVPQuTv5hg_JWbhbwm5k6KU4hPIFq0SIgdc,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -270,7 +270,7 @@ unstructured_ingest/connector/notion/types/database_properties/unique_id.py,sha2
 unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ2tVUm9UlKVtDA0NQiFIRJ5PHYW9wOaWt2vFfSVCg,862
 unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
 unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-unstructured_ingest/embed/azure_openai.py,sha256=4YBOIxv66wVZ5EqNNC4uCDPNJ3VrsLPe5wwagT6zqe0,1001
+unstructured_ingest/embed/azure_openai.py,sha256=u9reyZzY6BtsT5U_TdIfS6vH_42lvohVBwKMPQAqvkI,1528
 unstructured_ingest/embed/bedrock.py,sha256=50G8PBEdW3ILwyWXAWl4w-gUA9I0AR7LuFq6NLz-sWI,7284
 unstructured_ingest/embed/huggingface.py,sha256=Avcc16st9Cp2xGScG6TeNEEd3T8YjjnESNN4OdIlnh0,2119
 unstructured_ingest/embed/interfaces.py,sha256=7jsQ3rLOXy1hq__muf-EPcLnv17XzNQaD05AyGbZeNo,3739
@@ -399,7 +399,7 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=rrZLTjmTcrDL-amQIKzIP6j2fW-
 unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
 unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
-unstructured_ingest/v2/pipeline/pipeline.py,sha256=-1TlqG33x_GGjGMk4Y8Psx1z6Prbuj11MMAR2WAuhBc,16520
+unstructured_ingest/v2/pipeline/pipeline.py,sha256=4IwCWMlBrMpZI6V82q5nzrbyQNDVM62AQsWt6MUBWa8,16508
 unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
 unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
@@ -438,11 +438,12 @@ unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_Spsw
 unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
 unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
-unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
+unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=SdcbOEUzgi1sUZJA6doZDm-a8d4F3Qtud-OVbDKW7Ng,4456
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
 unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
 unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
 unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
 unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
 unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=Yj4fIxgGo9Qh1x_6-INdbrPGHCuZElu-QKNfSVtW7F4,7694
 unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=TA2e_1SIr4VaEI62873eyReCNfgmQ51_2Pko2I04pPM,2747
@@ -561,9 +562,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
 unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
-unstructured_ingest-0.4.6.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.4.6.dist-info/METADATA,sha256=-Z6UDd_I1lUsEbYTmeBlNb4D4-e3y67LM4n75igK1tY,8051
-unstructured_ingest-0.4.6.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.4.6.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.4.6.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.4.6.dist-info/RECORD,,
+unstructured_ingest-0.4.7.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.4.7.dist-info/METADATA,sha256=yGcahQ8fZmoU_c1h02b76tRn5w0uj_931AAQKlFrqxs,8051
+unstructured_ingest-0.4.7.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.4.7.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.4.7.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.4.7.dist-info/RECORD,,