unstructured-ingest 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +13 -13
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +37 -34
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.0.1"  # pragma: no cover
+__version__ = "0.0.2"  # pragma: no cover

unstructured_ingest/v2/cli/base/cmd.py
@@ -24,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
 
 CommandT = TypeVar("CommandT", bound=click.Command)
@@ -75,6 +76,8 @@ class BaseCmd(ABC):
         }
         if chunker := self.get_chunker(options=source_options):
             pipeline_kwargs["chunker"] = chunker
+        if filterer := self.get_filterer(options=source_options):
+            pipeline_kwargs["filterer"] = filterer
         if embedder := self.get_embeder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
@@ -105,6 +108,13 @@ class BaseCmd(ABC):
             return None
         return Chunker(config=chunker_config)
 
+    @staticmethod
+    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
+        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
+        if not filterer_configs.to_dict():
+            return None
+        return Filterer(config=filterer_configs)
+
     @staticmethod
     def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
         embedder_config = extract_config(flat_data=options, config=EmbedderConfig)

unstructured_ingest/v2/cli/base/src.py
@@ -8,6 +8,7 @@ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
 from unstructured_ingest.v2.cli.configs import (
     ChunkerCliConfig,
     EmbedderCliConfig,
+    FilterCliConfig,
     PartitionerCliConfig,
     ProcessorCliConfig,
 )
@@ -26,6 +27,7 @@ class SrcCmd(BaseCmd):
             ProcessorCliConfig,
             PartitionerCliConfig,
             EmbedderCliConfig,
+            FilterCliConfig,
             ChunkerCliConfig,
         ]
     )

unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass
 import click
 
 from unstructured_ingest.v2.cli.interfaces import CliConfig
-from unstructured_ingest.v2.cli.utils import DelimitedString
 
 
 @dataclass
@@ -14,7 +13,7 @@ class FsspecCliDownloadConfig(CliConfig):
             click.Option(
                 ["--download-dir"],
                 help="Where files are downloaded to, defaults to a location at"
-                "`$HOME/.cache/…
+                "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
             ),
         ]
 
@@ -65,13 +64,6 @@ class FsspecCliIndexerConfig(FsspecCliFileConfig):
                     help="Recursively download files in their respective folders "
                     "otherwise stop at the files in provided folder level.",
                 ),
-                click.Option(
-                    ["--file-glob"],
-                    default=None,
-                    type=DelimitedString(),
-                    help="A comma-separated list of file globs to limit which types of "
-                    "local files are accepted, e.g. '*.html,*.txt'",
-                ),
             ]
         )
         return options

unstructured_ingest/v2/cli/cmds/local.py
@@ -4,7 +4,6 @@ import click
 
 from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
 from unstructured_ingest.v2.cli.interfaces import CliConfig
-from unstructured_ingest.v2.cli.utils import DelimitedString
 from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE
 
 
@@ -19,13 +18,6 @@ class LocalCliIndexerConfig(CliConfig):
                 type=click.Path(file_okay=True, dir_okay=True, exists=True),
                 help="Path to the location in the local file system that will be processed.",
             ),
-            click.Option(
-                ["--file-glob"],
-                default=None,
-                type=DelimitedString(),
-                help="A comma-separated list of file globs to limit which types of "
-                "local files are accepted, e.g. '*.html,*.txt'",
-            ),
             click.Option(
                 ["--recursive"],
                 is_flag=True,

unstructured_ingest/v2/cli/configs/__init__.py
@@ -1,6 +1,13 @@
 from .chunk import ChunkerCliConfig
 from .embed import EmbedderCliConfig
+from .filter import FilterCliConfig
 from .partition import PartitionerCliConfig
 from .processor import ProcessorCliConfig
 
-__all__ = […
+__all__ = [
+    "ChunkerCliConfig",
+    "ProcessorCliConfig",
+    "PartitionerCliConfig",
+    "EmbedderCliConfig",
+    "FilterCliConfig",
+]

unstructured_ingest/v2/cli/configs/filter.py (new file)
@@ -0,0 +1,28 @@
+from dataclasses import dataclass
+
+import click
+
+from unstructured_ingest.v2.cli.interfaces import CliConfig
+from unstructured_ingest.v2.cli.utils import DelimitedString
+
+
+@dataclass
+class FilterCliConfig(CliConfig):
+    @staticmethod
+    def get_cli_options() -> list[click.Option]:
+        options = [
+            click.Option(
+                ["--file-glob"],
+                default=None,
+                type=DelimitedString(),
+                help="A comma-separated list of file globs to limit which types of "
+                "local files are accepted, e.g. '*.html,*.txt'",
+            ),
+            click.Option(
+                ["--max-file-size"],
+                default=None,
+                type=click.IntRange(min=1),
+                help="Max file size to process in bytes",
+            ),
+        ]
+        return options
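
These flags are collected into a `FiltererConfig` by `BaseCmd.get_filterer` via `extract_config` (see the cmd.py hunk above). A minimal sketch of building the filter process directly, assuming the config field names mirror the CLI flags (`file_glob`, `max_file_size`); `unstructured_ingest/v2/processes/filter.py` itself is not rendered in this diff, so treat the field names as assumptions:

```python
from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig

# Field names assumed to mirror the --file-glob and --max-file-size options above.
config = FiltererConfig(
    file_glob=["*.html", "*.txt"],  # keep only files matching one of these globs
    max_file_size=10_000_000,       # skip anything larger than ~10 MB
)
filterer = Filterer(config=config)
```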

unstructured_ingest/v2/interfaces/__init__.py
@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, SourceIdentifiers
+from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -26,4 +26,5 @@ __all__ = [
     "AccessConfig",
     "ConnectionConfig",
     "BaseConnector",
+    "FileDataSourceMetadata",
 ]

unstructured_ingest/v2/interfaces/downloader.py
@@ -30,6 +30,15 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     connector_type: str
     download_config: DownloaderConfigT
 
+    def get_download_path(self, file_data: FileData) -> Optional[Path]:
+        if not file_data.source_identifiers:
+            return None
+        rel_path = file_data.source_identifiers.relative_path
+        if not rel_path:
+            return None
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        return self.download_dir / Path(rel_path)
+
     @staticmethod
     def is_float(value: str):
         try:
@@ -68,9 +77,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     def is_async(self) -> bool:
         return True
 
-    def get_download_path(self, file_data: FileData) -> Optional[Path]:
-        return None
-
     @abstractmethod
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
         pass
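
A standalone sketch of the path mapping that this new default `get_download_path` performs; the directory and relative path values here are illustrative only:

```python
from pathlib import Path

def resolve_download_path(download_dir: Path, relative_path: str) -> Path:
    # Mirrors the logic above: strip a leading "/" from the source-relative
    # path and nest it under the connector's download directory.
    rel_path = relative_path[1:] if relative_path.startswith("/") else relative_path
    return download_dir / Path(rel_path)

# Illustrative values:
print(resolve_download_path(Path("/tmp/ingest-cache/s3"), "/docs/report.pdf"))
# -> /tmp/ingest-cache/s3/docs/report.pdf
```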

unstructured_ingest/v2/interfaces/file_data.py
@@ -22,13 +22,18 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath
 
 
+@dataclass
+class FileDataSourceMetadata(DataSourceMetadata):
+    filesize_bytes: Optional[int] = None
+
+
 @dataclass
 class FileData(DataClassJsonMixin):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
     doc_type: Literal["file", "batch"] = field(default="file")
-    metadata: …
+    metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
 
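
A small sketch of constructing the updated `FileData` record with the new `FileDataSourceMetadata`; the identifier and connector name are illustrative, and the fields inherited from `DataSourceMetadata` are assumed to be optional and left at their defaults:

```python
from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata

file_data = FileData(
    identifier="example-0001",  # illustrative id, normally assigned by an indexer
    connector_type="local",     # illustrative connector name
    metadata=FileDataSourceMetadata(filesize_bytes=2048),
)
# DataClassJsonMixin serialization, used by the pipeline steps to persist records to disk.
print(file_data.to_dict())
```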

unstructured_ingest/v2/pipeline/interfaces.py
@@ -92,7 +92,7 @@ class PipelineStep(ABC):
 
         if iterable:
             if len(iterable) == 1:
-                return …
+                return self.process_serially(iterable)
         if self.context.num_processes == 1:
             return self.process_serially(iterable)
         with mp.Pool(
@@ -126,6 +126,8 @@ class PipelineStep(ABC):
             logger.info(
                 f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
             )
+        else:
+            logger.info(f"Calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
             return self.process_async(iterable=iterable)
         if self.context.mp_supported:
@@ -146,8 +148,6 @@
             logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
             if "file_data_path" in kwargs:
                 self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
-            else:
-                self.context.status[self.identifier] = {"step_error": str(e)}
             if self.context.raise_on_error:
                 raise e
             return None
@@ -160,8 +160,6 @@
             logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
             if "file_data_path" in kwargs:
                 self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
-            else:
-                self.context.status[self.identifier] = {"step_error": str(e)}
             if self.context.raise_on_error:
                 raise e
             return None

unstructured_ingest/v2/pipeline/pipeline.py
@@ -9,6 +9,7 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
+from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
 from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
 from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
 from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
@@ -27,6 +28,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader
 from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+from unstructured_ingest.v2.processes.filter import FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
 
 
@@ -37,22 +39,33 @@ class PipelineError(Exception):
 @dataclass
 class Pipeline:
     context: ProcessorConfig
+
     indexer: InitVar[IndexerT]
     indexer_step: IndexStep = field(init=False)
+
     downloader: InitVar[DownloaderT]
     downloader_step: DownloadStep = field(init=False)
+
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
+
     chunker: InitVar[Optional[Chunker]] = None
     chunker_step: ChunkStep = field(init=False, default=None)
+
     embedder: InitVar[Optional[Embedder]] = None
     embedder_step: EmbedStep = field(init=False, default=None)
+
     stager: InitVar[Optional[UploadStager]] = None
     stager_step: UploadStageStep = field(init=False, default=None)
+
     uploader: InitVar[Uploader] = field(default=LocalUploader())
     uploader_step: UploadStep = field(init=False, default=None)
+
     uncompress_step: UncompressStep = field(init=False, default=None)
 
+    filterer: InitVar[Optional[Filterer]] = None
+    filter_step: FilterStep = field(init=False, default=None)
+
     def __post_init__(
         self,
         indexer: IndexerT,
@@ -62,10 +75,12 @@
         embedder: Embedder = None,
         stager: UploadStager = None,
         uploader: Uploader = None,
+        filterer: Filterer = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         self.indexer_step = IndexStep(process=indexer, context=self.context)
         self.downloader_step = DownloadStep(process=downloader, context=self.context)
+        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
         self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
@@ -109,6 +124,7 @@
     def run(self):
         try:
             start_time = time()
+            self._run_prechecks()
             self._run()
             logger.info(f"Finished ingest process in {time() - start_time}s")
         finally:
@@ -130,6 +146,37 @@
         final = [f for f in flat if f]
         return final or None
 
+    def _run_prechecks(self):
+        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
+        if self.chunker_step:
+            steps.append(self.chunker_step)
+        if self.embedder_step:
+            steps.append(self.embedder_step)
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        if self.stager_step:
+            steps.append(self.stager_step)
+        failures = {}
+        for step in steps:
+            try:
+                step.process.precheck()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step precheck failure: {k}: {v}")
+            raise PipelineError("Precheck failed")
+
+    def apply_filter(self, records: list[dict]) -> list[dict]:
+        if not self.filter_step:
+            return records
+        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
+        filtered_data = self.filter_step(data_to_filter)
+        filtered_data = [f for f in filtered_data if f is not None]
+        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
+        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
+        return filtered_records
+
     def _run(self):
         logger.info(
             f"Running local pipline: {self} with configs: "
@@ -147,18 +194,33 @@
         if not indices_inputs:
             return
 
+        # Initial filtering on indexed content
+        indices_inputs = self.apply_filter(records=indices_inputs)
+        if not indices_inputs:
+            return
+
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
             return
 
+        # Post download filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         # Run uncompress if available
         if self.uncompress_step:
             downloaded_data = self.uncompress_step(downloaded_data)
         # Flatten list of lists
         downloaded_data = self.clean_results(results=downloaded_data)
 
+        # Post uncompress filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         if not downloaded_data:
             return
 
@@ -179,9 +241,14 @@
         self.uploader_step(iterable=elements)
 
     def __str__(self):
-        s = [str(self.indexer_step)…
+        s = [str(self.indexer_step)]
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        s.append(str(self.downloader_step))
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
         if uncompress_step := self.uncompress_step:
-            s.…
+            s.extend([str(uncompress_step), str(filter_step)])
         s.append(str(self.partitioner_step))
         if chunker_step := self.chunker_step:
             s.append(str(chunker_step))
@@ -200,6 +267,7 @@
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
+        filterer_config: FiltererConfig = None,
         chunker_config: Optional[ChunkerConfig] = None,
         embedder_config: Optional[EmbedderConfig] = None,
         destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +303,8 @@
             ),
             "partitioner": Partitioner(config=partitioner_config),
         }
+        if filterer_config:
+            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
         if chunker_config:
             pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
         if embedder_config:
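
Taken together, these pipeline.py changes let a Filterer be wired in either through the config-driven constructor above (`filterer_config`) or directly as the `filterer` InitVar. A minimal sketch of the direct form, assuming the local connector exposes the usual indexer/downloader classes; only `LocalUploader` appears in this diff, so `LocalIndexer*`, `LocalDownloader*`, `LocalConnectionConfig`, and their field names are assumptions, as is the `file_glob` field on `FiltererConfig`:

```python
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.processes.connectors.local import (  # names beyond LocalUploader* are assumed
    LocalConnectionConfig,
    LocalDownloader,
    LocalDownloaderConfig,
    LocalIndexer,
    LocalIndexerConfig,
    LocalUploader,
    LocalUploaderConfig,
)
from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig

pipeline = Pipeline(
    context=ProcessorConfig(),
    indexer=LocalIndexer(
        index_config=LocalIndexerConfig(input_path="example-docs"),  # assumed field name
        connection_config=LocalConnectionConfig(),
    ),
    downloader=LocalDownloader(
        download_config=LocalDownloaderConfig(),
        connection_config=LocalConnectionConfig(),
    ),
    partitioner=Partitioner(config=PartitionerConfig()),
    filterer=Filterer(config=FiltererConfig(file_glob=["*.txt"])),  # assumed field name
    uploader=LocalUploader(upload_config=LocalUploaderConfig(output_dir="output")),  # assumed field name
)
pipeline.run()  # runs prechecks, indexing, filtering, download, partition, upload
```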

unstructured_ingest/v2/pipeline/steps/download.py
@@ -2,6 +2,7 @@ import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar
 
 from unstructured_ingest.v2.interfaces import FileData, download_responses
@@ -70,11 +71,40 @@ class DownloadStep(PipelineStep):
             return True
         return False
 
+    def update_file_data(
+        self, file_data: FileData, file_data_path: Path, download_path: Path
+    ) -> None:
+        file_size_bytes = download_path.stat().st_size
+        changed = False
+        if not file_data.metadata.filesize_bytes and file_size_bytes:
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if (
+            file_data.metadata.filesize_bytes
+            and file_data.metadata.filesize_bytes != file_size_bytes
+        ):
+            logger.warning(
+                f"file size in original file data "
+                f"({file_data.metadata.filesize_bytes}) doesn't "
+                f"match size of local file: {file_size_bytes}, updating"
+            )
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if changed:
+            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+            with file_data_path.open("w") as file:
+                json.dump(file_data.to_dict(), file, indent=2)
+
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            self.update_file_data(
+                file_data=file_data,
+                file_data_path=Path(file_data_path),
+                download_path=download_path,
+            )
             return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
@@ -85,26 +115,60 @@ class DownloadStep(PipelineStep):
         else:
             download_results = await fn(**fn_kwargs)
         return self.create_step_results(
-            current_file_data_path=file_data_path,
+            current_file_data_path=file_data_path,
+            download_results=download_results,
+            current_file_data=file_data,
         )
 
     def create_step_results(
-        self,
+        self,
+        current_file_data_path: str,
+        current_file_data: FileData,
+        download_results: download_responses,
     ) -> list[DownloadStepResponse]:
+        responses = []
         if not isinstance(download_results, list):
-        …
-        …
-        …
+            file_data = current_file_data
+            file_data_path = current_file_data_path
+            download_path = download_results["path"]
+            if download_results["file_data"].identifier == current_file_data.identifier:
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
+                ]
+            else:
+                file_data = download_results["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
                 )
-        …
+                responses = [
+                    DownloadStepResponse(
+                        file_data_path=current_file_data_path, path=str(download_results["path"])
+                    )
+                ]
+        else:
             # Supplemental results generated as part of the download process
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+            for res in download_results:
+                file_data = res["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                download_path = res["path"]
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses.append(
+                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
+                )
+
+        return responses
 
     def persist_new_file_data(self, file_data: FileData) -> str:
         record_hash = self.get_hash(extras=[file_data.identifier])

unstructured_ingest/v2/pipeline/steps/filter.py (new file)
@@ -0,0 +1,40 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.processes.filter import Filterer
+
+STEP_ID = "filter"
+
+
+@dataclass
+class FilterStep(PipelineStep):
+    process: Filterer
+    identifier: str = STEP_ID
+
+    def __post_init__(self):
+        config = (
+            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+            if self.process.config
+            else None
+        )
+        logger.info(f"Created {self.identifier} with configs: {config}")
+
+    async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
+        file_data = FileData.from_file(path=file_data_path)
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            resp = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                resp = await fn(**fn_kwargs)
+        else:
+            resp = await fn(**fn_kwargs)
+
+        if resp:
+            return {"file_data_path": file_data_path}
+        return None
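
FilterStep loads the serialized FileData record, calls the filter process with `file_data=...`, and keeps the record only when the call returns a truthy value. A hypothetical standalone filter that satisfies that contract (the real `Filterer` in `processes/filter.py` is not rendered in this diff):

```python
from typing import Any, Optional

from unstructured_ingest.v2.interfaces import FileData


class MaxSizeFilter:
    """Hypothetical filter honoring the contract FilterStep expects: truthy means keep."""

    def __init__(self, max_file_size: Optional[int] = None):
        self.max_file_size = max_file_size

    def is_async(self) -> bool:
        return False

    def run(self, file_data: FileData, **kwargs: Any) -> bool:
        size = file_data.metadata.filesize_bytes
        if self.max_file_size is None or size is None:
            return True  # nothing to compare against; keep the record
        return size <= self.max_file_size
```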

unstructured_ingest/v2/processes/connectors/astra.py
@@ -7,6 +7,7 @@ from unstructured import __name__ as integration_name
 from unstructured.__version__ import __version__ as integration_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -94,6 +95,13 @@ class AstraUploader(Uploader):
     upload_config: AstraUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def precheck(self) -> None:
+        try:
+            self.get_collection()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["astrapy"], extras="astra")
     def get_collection(self) -> "AstraDBCollection":
         from astrapy.db import AstraDB

unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py
@@ -175,6 +175,14 @@ class AzureCognitiveSearchUploader(Uploader):
         ),
     )
 
+    def precheck(self) -> None:
+        try:
+            client = self.connection_config.generate_client()
+            client.get_document_count()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)
 

unstructured_ingest/v2/processes/connectors/chroma.py
@@ -111,10 +111,13 @@ class ChromaUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: ChromaUploaderConfig
     connection_config: ChromaConnectionConfig
-    client: Optional["Client"] = field(init=False)
 
-    def …
-    …
+    def precheck(self) -> None:
+        try:
+            self.create_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["chromadb"], extras="chroma")
     def create_client(self) -> "Client":
@@ -187,10 +190,9 @@
             f"collection {self.connection_config.collection_name} "
             f"at {self.connection_config.host}",
         )
+        client = self.create_client()
 
-        collection = …
-            name=self.connection_config.collection_name
-        )
+        collection = client.get_or_create_collection(name=self.connection_config.collection_name)
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             self.upsert_batch(collection, self.prepare_chroma_list(chunk))
 