unstructured-ingest 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +1 -1
- unstructured_ingest/logger.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/cmds/milvus.py +72 -0
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +3 -1
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -2
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/milvus.py +200 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +10 -7
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +44 -39
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/filter.py
@@ -0,0 +1,40 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.processes.filter import Filterer
+
+STEP_ID = "filter"
+
+
+@dataclass
+class FilterStep(PipelineStep):
+    process: Filterer
+    identifier: str = STEP_ID
+
+    def __post_init__(self):
+        config = (
+            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+            if self.process.config
+            else None
+        )
+        logger.info(f"Created {self.identifier} with configs: {config}")
+
+    async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
+        file_data = FileData.from_file(path=file_data_path)
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            resp = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                resp = await fn(**fn_kwargs)
+        else:
+            resp = await fn(**fn_kwargs)
+
+        if resp:
+            return {"file_data_path": file_data_path}
+        return None

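The new `FilterStep._run_async` dispatches on whether the wrapped filter function is a coroutine, optionally gating async calls behind a semaphore. A minimal standalone sketch of that dispatch pattern (the names below are illustrative, not part of the package):

```python
import asyncio
from typing import Callable, Optional

async def run_step(fn: Callable, semaphore: Optional[asyncio.Semaphore] = None, **fn_kwargs):
    # Plain functions are called synchronously; coroutine functions are
    # awaited, under the semaphore when one is configured.
    if not asyncio.iscoroutinefunction(fn):
        return fn(**fn_kwargs)
    if semaphore is not None:
        async with semaphore:
            return await fn(**fn_kwargs)
    return await fn(**fn_kwargs)

def keep_all(file_data: dict) -> bool:
    # Trivial synchronous "filter": keep every file.
    return True

async def keep_json(file_data: dict) -> bool:
    # Async filter: keep only .json paths.
    return file_data["path"].endswith(".json")

async def main() -> None:
    sem = asyncio.Semaphore(2)
    print(await run_step(keep_all, file_data={"path": "a.txt"}))                   # True
    print(await run_step(keep_json, semaphore=sem, file_data={"path": "a.txt"}))   # False

asyncio.run(main())
```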
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
-import
-
+import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
     add_source_entry,
@@ -19,6 +18,8 @@ from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
 from .local import local_destination_entry, local_source_entry
+from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
+from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
@@ -75,3 +76,4 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
 add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
+add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)

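The new milvus connector is wired in through the same module-level registry calls as the other destinations. A toy sketch of that registration pattern (the dict-based registry below is a simplification for illustration, not the package's actual `connector_registry` implementation):

```python
from dataclasses import dataclass
from typing import Callable

# Simplified registry; the real package stores richer entry objects.
_DESTINATION_REGISTRY: dict[str, "DestinationEntry"] = {}

@dataclass
class DestinationEntry:
    uploader_factory: Callable[[], object]

def add_destination_entry(destination_type: str, entry: DestinationEntry) -> None:
    # Importing the connectors package registers every connector as a side effect.
    if destination_type in _DESTINATION_REGISTRY:
        raise ValueError(f"destination {destination_type!r} already registered")
    _DESTINATION_REGISTRY[destination_type] = entry

add_destination_entry(destination_type="milvus", entry=DestinationEntry(uploader_factory=object))
print(sorted(_DESTINATION_REGISTRY))  # ['milvus']
```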
unstructured_ingest/v2/processes/connectors/astra.py
@@ -7,6 +7,7 @@ from unstructured import __name__ as integration_name
 from unstructured.__version__ import __version__ as integration_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -94,6 +95,13 @@ class AstraUploader(Uploader):
     upload_config: AstraUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def precheck(self) -> None:
+        try:
+            self.get_collection()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["astrapy"], extras="astra")
     def get_collection(self) -> "AstraDBCollection":
         from astrapy.db import AstraDB

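Several connectors in this release gain the same fail-fast `precheck` hook: make one cheap authenticated call, log the failure, and re-raise as a connection error so a pipeline can abort before indexing or uploading anything. The shape of the pattern, sketched with a hypothetical client call:

```python
import logging

logger = logging.getLogger(__name__)

class DestinationConnectionError(Exception):
    """Stand-in for unstructured_ingest.error.DestinationConnectionError."""

class Uploader:
    def get_client(self):
        # Hypothetical: the real connectors call e.g. get_collection(),
        # generate_client(), or connection_config.get_client() here.
        raise NotImplementedError

    def precheck(self) -> None:
        try:
            self.get_client()  # any cheap call that requires valid credentials
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")
```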
unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py
@@ -175,6 +175,14 @@ class AzureCognitiveSearchUploader(Uploader):
         ),
     )
 
+    def precheck(self) -> None:
+        try:
+            client = self.connection_config.generate_client()
+            client.get_document_count()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)
 

unstructured_ingest/v2/processes/connectors/chroma.py
@@ -111,10 +111,13 @@ class ChromaUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: ChromaUploaderConfig
     connection_config: ChromaConnectionConfig
-    client: Optional["Client"] = field(init=False)
 
-    def __post_init__(self):
-        self.client = self.create_client()
+    def precheck(self) -> None:
+        try:
+            self.create_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["chromadb"], extras="chroma")
     def create_client(self) -> "Client":
@@ -187,10 +190,9 @@ class ChromaUploader(Uploader):
             f"collection {self.connection_config.collection_name} "
             f"at {self.connection_config.host}",
         )
+        client = self.create_client()
 
-        collection = self.client.get_or_create_collection(
-            name=self.connection_config.collection_name
-        )
+        collection = client.get_or_create_collection(name=self.connection_config.collection_name)
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             self.upsert_batch(collection, self.prepare_chroma_list(chunk))
 

unstructured_ingest/v2/processes/connectors/databricks_volumes.py
@@ -3,6 +3,7 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -11,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     Uploader,
     UploaderConfig,
 )
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
@@ -78,6 +80,13 @@ class DatabricksVolumesUploader(Uploader):
             host=self.connection_config.host, **self.connection_config.access_config.to_dict()
         )
 
+    def precheck(self) -> None:
+        try:
+            assert self.client.current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         for content in contents:
             with open(content.path, "rb") as elements_file:

unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -7,10 +7,12 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -121,11 +124,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
-    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.connection_config.get_client()
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -138,8 +144,9 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+        client = self.connection_config.get_client()
         hits = scan(
-            self.client,
+            client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -168,7 +175,7 @@ class ElasticsearchIndexer(Indexer):
         yield FileData(
             identifier=identified,
             connector_type=CONNECTOR_TYPE,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                 date_processed=str(time()),
             ),
@@ -234,7 +241,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
                     record_locator={
@@ -339,6 +346,13 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,14 +1,12 @@
 from __future__ import annotations
 
 import contextlib
-import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
-
-from unstructured.documents.elements import DataSourceMetadata
+from uuid import NAMESPACE_DNS, uuid5
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -19,6 +17,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -73,7 +72,6 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -108,17 +106,7 @@ class FsspecIndexer(Indexer):
         **self.connection_config.get_access_config(),
     )
 
-    def does_path_match_glob(self, path: str) -> bool:
-        if self.index_config.file_glob is None:
-            return True
-        patterns = self.index_config.file_glob
-        for pattern in patterns:
-            if fnmatch.filter([path], pattern):
-                return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
-        return False
-
-    def check_connection(self):
+    def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
         try:
@@ -156,10 +144,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
-
+        file_size = None
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -179,6 +167,8 @@ class FsspecIndexer(Indexer):
             date_modified = str(modified)
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -188,15 +178,19 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
+        file_stat = self.fs.stat(path=path)
+        if file_id := file_stat.get("id"):
+            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=str(version),
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     def sterilize_info(self, path) -> dict:
@@ -204,14 +198,16 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        raw_files = self.list_files()
-        files = [f for f in raw_files if self.does_path_match_glob(f)]
+        files = self.list_files()
         for file in files:
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+
+            additional_metadata = self.sterilize_info(path=file)
+            additional_metadata["original_file_path"] = file
             yield FileData(
-                identifier=file,
+                identifier=str(uuid5(NAMESPACE_DNS, file)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
                     filename=Path(file).name,
@@ -219,7 +215,7 @@ class FsspecIndexer(Indexer):
                     fullpath=file,
                 ),
                 metadata=self.get_metadata(path=file),
-                additional_metadata=self.sterilize_info(path=file),
+                additional_metadata=additional_metadata,
             )
 
 
@@ -251,18 +247,12 @@ class FsspecDownloader(Downloader):
         **self.connection_config.get_access_config(),
     )
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        return (
-            self.download_dir / Path(file_data.source_identifiers.relative_path)
-            if self.download_config
-            else Path(file_data.source_identifiers.rel_path)
-        )
-
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
-            self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -272,7 +262,8 @@ class FsspecDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

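The fsspec indexer stops using the raw remote path as `FileData.identifier` and instead derives a deterministic UUID from it, stashing the original path in `additional_metadata["original_file_path"]` for the downloader. Since `uuid5` is a pure function of its namespace and name, the same path always maps to the same identifier:

```python
from uuid import NAMESPACE_DNS, uuid5

path = "my-bucket/docs/report.pdf"  # illustrative remote path
a = uuid5(NAMESPACE_DNS, path)
b = uuid5(NAMESPACE_DNS, path)
assert a == b  # deterministic: stable across runs and processes
print(a)       # always the same UUID for this path
```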
unstructured_ingest/v2/processes/connectors/fsspec/s3.py
@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
+from unstructured.utils import requires_dependencies
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.
-
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -66,9 +70,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -76,6 +81,8 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -90,13 +97,14 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")

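Both the generic fsspec indexer and the S3 indexer now record `filesize_bytes`, guarding the lookup with `contextlib.suppress(AttributeError)` because not every fsspec filesystem implementation exposes a `size()` method. A self-contained sketch of that guard (the `MinimalFS` class is a made-up stand-in):

```python
import contextlib

class MinimalFS:
    """Stand-in for an fsspec filesystem that lacks a size() method."""

file_size = None
fs = MinimalFS()
with contextlib.suppress(AttributeError):
    file_size = fs.size("some/path")  # AttributeError is silently swallowed
print(file_size)  # None: the metadata field simply stays unset
```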
unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -1,15 +1,16 @@
 import io
 import os
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -18,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -121,6 +123,13 @@ class GoogleDriveIndexer(Indexer):
         ]
     )
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +164,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),
@@ -272,11 +281,6 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False

unstructured_ingest/v2/processes/connectors/local.py
@@ -1,12 +1,9 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
-
-from unstructured.documents.elements import DataSourceMetadata
+from typing import Any, Generator
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -15,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -45,7 +43,6 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -64,16 +61,11 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-
-
-
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))
 
-    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -93,12 +85,20 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-        return DataSourceMetadata(
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
             date_modified=date_modified,
             date_created=date_created,
             date_processed=str(time()),
             permissions_data=permissions_data,
             record_locator={"path": str(path.resolve())},
+            filesize_bytes=filesize_bytes,
         )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:

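With `file_glob` gone, the local indexer's directory listing reduces to a choice between `Path.glob` and `Path.rglob`. The difference in one self-contained run (directory and file names below are arbitrary):

```python
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "sub").mkdir()
    (root / "top.txt").touch()
    (root / "sub" / "nested.txt").touch()

    flat = [p for p in root.glob("*") if p.is_file()]    # top level only
    deep = [p for p in root.rglob("*") if p.is_file()]   # recursive descent
    print([p.name for p in flat])          # ['top.txt']
    print(sorted(p.name for p in deep))    # ['nested.txt', 'top.txt']
```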