unstructured-ingest 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +13 -13
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +37 -34
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
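
Four changes recur across the connector diffs below: every indexer and uploader gains a `precheck()` hook that validates the connection up front; the `DataSourceMetadata` class imported from `unstructured.documents.elements` is replaced by an ingest-owned `FileDataSourceMetadata` (which adds a `filesize_bytes` field); glob filtering (`file_glob`) moves off the indexer configs into a dedicated filter step (see the new `v2/processes/filter.py` and `v2/pipeline/steps/filter.py` above); and per-connector `get_download_path` overrides are dropped in favor of the base `Downloader` implementation. A minimal sketch of the `precheck` contract as the diffs show it; the runner loop and helper names here are hypothetical, not code from this release:

from unstructured_ingest.error import SourceConnectionError


class ExampleIndexer:
    def precheck(self) -> None:
        # Validate credentials/reachability before any work starts, raising a
        # typed connection error instead of failing mid-pipeline.
        try:
            self.connect()  # hypothetical helper standing in for get_client() etc.
        except Exception as e:
            raise SourceConnectionError(f"failed to validate connection: {e}")

    def connect(self) -> None:
        pass  # stand-in; real connectors build an SDK client here


def run_pipeline(steps: list) -> None:
    # A runner can fail fast by prechecking every step first (hypothetical).
    for step in steps:
        if precheck := getattr(step, "precheck", None):
            precheck()


run_pipeline([ExampleIndexer()])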

unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -3,6 +3,7 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -11,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     Uploader,
     UploaderConfig,
 )
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
@@ -78,6 +80,13 @@ class DatabricksVolumesUploader(Uploader):
             host=self.connection_config.host, **self.connection_config.access_config.to_dict()
         )
 
+    def precheck(self) -> None:
+        try:
+            assert self.client.current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         for content in contents:
             with open(content.path, "rb") as elements_file:

unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -7,10 +7,12 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -121,11 +124,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
-    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.connection_config.get_client()
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -138,8 +144,9 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+        client = self.connection_config.get_client()
         hits = scan(
-            self.client,
+            client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -168,7 +175,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
@@ -234,7 +241,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
                     record_locator={
@@ -339,6 +346,13 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk
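
Both the Elasticsearch indexer and uploader stop caching a live client on the dataclass (`client: "ElasticsearchClient" = field(init=False)`) and instead call `connection_config.get_client()` at the point of use, with `precheck()` exercising the same call. A plausible motivation, not stated in the diff, is that a live client object does not serialize when pipeline steps are shipped to worker processes, while a plain connection config does. A sketch of the pattern with stand-in types (not the connector's real classes):

from dataclasses import dataclass


@dataclass
class StubConnectionConfig:
    hosts: list  # connection details stay plain data, safe to pickle

    def get_client(self):
        # Stand-in for lazily building an elasticsearch.Elasticsearch client.
        return object()


@dataclass
class StubIndexer:
    connection_config: StubConnectionConfig

    def run(self):
        client = self.connection_config.get_client()  # fresh client per call
        return client


StubIndexer(StubConnectionConfig(hosts=["http://localhost:9200"])).run()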

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,14 +1,12 @@
 from __future__ import annotations
 
 import contextlib
-import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
-
-from unstructured.documents.elements import DataSourceMetadata
+from uuid import NAMESPACE_DNS, uuid5
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -19,6 +17,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -73,7 +72,6 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -108,17 +106,7 @@ class FsspecIndexer(Indexer):
             **self.connection_config.get_access_config(),
         )
 
-    def does_path_match_glob(self, path: str) -> bool:
-        if self.index_config.file_glob is None:
-            return True
-        patterns = self.index_config.file_glob
-        for pattern in patterns:
-            if fnmatch.filter([path], pattern):
-                return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
-        return False
-
-    def check_connection(self):
+    def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
         try:
@@ -156,10 +144,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
-
+        file_size = None
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -179,6 +167,8 @@ class FsspecIndexer(Indexer):
                 date_modified = str(modified)
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -188,15 +178,19 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
+        file_stat = self.fs.stat(path=path)
+        if file_id := file_stat.get("id"):
+            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=str(version),
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     def sterilize_info(self, path) -> dict:
@@ -204,14 +198,16 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        raw_files = self.list_files()
-        files = [f for f in raw_files if self.does_path_match_glob(f)]
+        files = self.list_files()
         for file in files:
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+
+            additional_metadata = self.sterilize_info(path=file)
+            additional_metadata["original_file_path"] = file
             yield FileData(
-                identifier=file,
+                identifier=str(uuid5(NAMESPACE_DNS, file)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
                     filename=Path(file).name,
@@ -219,7 +215,7 @@ class FsspecIndexer(Indexer):
                     fullpath=file,
                 ),
                 metadata=self.get_metadata(path=file),
-                additional_metadata=self.sterilize_info(path=file),
+                additional_metadata=additional_metadata,
             )
 
 
@@ -251,18 +247,12 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
        )
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        return (
-            self.download_dir / Path(file_data.source_identifiers.relative_path)
-            if self.download_config
-            else Path(file_data.source_identifiers.rel_path)
-        )
-
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -272,7 +262,8 @@ class FsspecDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
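
The fsspec indexer's `FileData.identifier` changes from the raw remote path to a deterministic UUID derived from it, with the original path preserved in `additional_metadata["original_file_path"]` so the downloader can still fetch the file. `uuid5` hashes the name into a stable ID, so re-indexing the same path always yields the same identifier:

from uuid import NAMESPACE_DNS, uuid5

path = "my-bucket/docs/report.pdf"  # illustrative remote path
print(str(uuid5(NAMESPACE_DNS, path)))  # same UUID on every run for this path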

unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
+from unstructured.utils import requires_dependencies
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
-
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -66,9 +70,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -76,6 +81,8 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -90,13 +97,14 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")

unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,15 +1,16 @@
 import io
 import os
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -18,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -121,6 +123,13 @@ class GoogleDriveIndexer(Indexer):
             ]
         )
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +164,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),
@@ -272,11 +281,6 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False

unstructured_ingest/v2/processes/connectors/local.py

@@ -1,12 +1,9 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
-
-from unstructured.documents.elements import DataSourceMetadata
+from typing import Any, Generator
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -15,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -45,7 +43,6 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -64,16 +61,11 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
-        if not self.index_config.file_glob:
-            return list(glob_fn("*"))
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))
 
-    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -93,12 +85,20 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-        return DataSourceMetadata(
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
             date_modified=date_modified,
             date_created=date_created,
             date_processed=str(time()),
             permissions_data=permissions_data,
             record_locator={"path": str(path.resolve())},
+            filesize_bytes=filesize_bytes,
         )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
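
With `file_glob` removed from both the fsspec and local indexer configs, glob filtering presumably now lives in the new filter step (`v2/processes/filter.py`, +54, and `v2/pipeline/steps/filter.py`, +40, in the file list), which this diff does not show. A minimal sketch of fnmatch-based glob filtering in the spirit of the `does_path_match_glob()` logic deleted from the fsspec indexer above; the `GlobFilterer` name and signature are assumptions, not the released API:

import fnmatch
from dataclasses import dataclass
from typing import Optional


@dataclass
class GlobFilterer:
    # Hypothetical name; the released class in v2/processes/filter.py may differ.
    file_glob: Optional[list[str]] = None

    def is_match(self, path: str) -> bool:
        # No patterns configured means every file passes, as in the old logic.
        if self.file_glob is None:
            return True
        return any(fnmatch.fnmatch(path, pattern) for pattern in self.file_glob)


filterer = GlobFilterer(file_glob=["*.pdf", "*.docx"])
assert filterer.is_match("docs/report.pdf")
assert not filterer.is_match("docs/notes.txt")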

unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional
 from unstructured.__version__ import __version__ as unstructured_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -85,11 +86,15 @@ class MongoDBUploaderConfig(UploaderConfig):
 class MongoDBUploader(Uploader):
     upload_config: MongoDBUploaderConfig
     connection_config: MongoDBConnectionConfig
-    client: Optional["MongoClient"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.create_client()
+    def precheck(self) -> None:
+        try:
+            client = self.create_client()
+            client.admin.command("ping")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pymongo"], extras="mongodb")
     def create_client(self) -> "MongoClient":
@@ -123,7 +128,8 @@ class MongoDBUploader(Uploader):
             f"collection {self.connection_config.collection} "
             f"at {self.connection_config.host}",
         )
-        db = self.client[self.connection_config.database]
+        client = self.create_client()
+        db = client[self.connection_config.database]
         collection = db[self.connection_config.collection]
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             collection.insert_many(chunk)
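
The MongoDB uploader likewise drops its cached client field and builds a client per operation; `precheck()` issues the `ping` admin command, pymongo's standard liveness probe. A minimal standalone sketch (the URI is a placeholder):

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
client.admin.command("ping")  # raises if the server is unreachable or auth fails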

unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -5,7 +5,6 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -87,6 +87,18 @@ class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
 
+    def precheck(self) -> None:
+        try:
+            token_resp: dict = self.connection_config.get_token()
+            if error := token_resp.get("error"):
+                raise SourceConnectionError(
+                    "{} ({})".format(error, token_resp.get("error_description"))
+                )
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_objects(self, folder, recursive) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
@@ -136,7 +148,7 @@ class OnedriveIndexer(Indexer):
             source_identifiers=SourceIdentifiers(
                 fullpath=server_path, filename=drive_item.name, rel_path=rel_path
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=drive_item.parent_reference.path + "/" + drive_item.name,
                 version=drive_item.etag,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,

unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -123,9 +123,12 @@ class PineconeUploader(Uploader):
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    …
-    …
-    …
+    def precheck(self):
+        try:
+            self.connection_config.get_index()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batch(self, batch):

unstructured_ingest/v2/processes/connectors/salesforce.py

@@ -18,10 +18,9 @@ from textwrap import dedent
 from typing import TYPE_CHECKING, Any, Generator, Type
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -132,6 +132,13 @@ class SalesforceIndexer(Indexer):
             if record_type not in ACCEPTED_CATEGORIES:
                 raise ValueError(f"{record_type} not currently an accepted Salesforce category")
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def get_file_extension(self, record_type) -> str:
         if record_type == "EmailMessage":
             extension = ".eml"
@@ -172,7 +179,7 @@ class SalesforceIndexer(Indexer):
                 filename=record_with_extension,
                 fullpath=f"{record['attributes']['type']}/{record_with_extension}",
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=record["attributes"]["url"],
                 version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                 date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
@@ -207,11 +214,6 @@ class SalesforceDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     def _xml_for_record(self, record: OrderedDict) -> str:
         """Creates partitionable xml file from a record"""
         import xml.etree.ElementTree as ET

unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -6,10 +6,8 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -18,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -134,6 +133,14 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
 
+    def precheck(self) -> None:
+        try:
+            site_client = self.connection_config.get_client()
+            site_client.site_pages.pages.get().execute_query()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()
@@ -187,7 +194,7 @@ class SharepointIndexer(Indexer):
                 fullpath=file_path,
                 rel_path=file_path.replace(self.index_config.path, ""),
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -222,7 +229,7 @@ class SharepointIndexer(Indexer):
                 fullpath=fullpath,
                 rel_path=rel_path,
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=absolute_url,
                 version=f"{file.major_version}.{file.minor_version}",
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -340,10 +347,9 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
 
     def get_download_path(self, file_data: FileData) -> Path:
+        download_path = super().get_download_path(file_data=file_data)
+
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        rel_path = file_data.source_identifiers.fullpath
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        download_path = self.download_dir / Path(rel_path)
         if content_type == SharepointContentType.SITEPAGE.value:
             # Update output extension to html if site page
             download_path = download_path.with_suffix(".html")
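
With the SharePoint override reduced to a `super()` call and the fsspec, Google Drive, and Salesforce overrides deleted outright, download-path construction is evidently consolidated on the base `Downloader`, consistent with the +9 -3 change to interfaces/downloader.py in the file list. The base implementation is not shown in this diff; a sketch of the behavior the removed overrides shared:

from pathlib import Path


def get_download_path(download_dir: Path, relative_path: str) -> Path:
    # Each removed override stripped a leading slash from the relative path
    # and joined it onto the download directory.
    return download_dir / Path(relative_path.lstrip("/"))


print(get_download_path(Path("/tmp/downloads"), "/sites/a/report.pdf"))
# /tmp/downloads/sites/a/report.pdf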
|