unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
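Note the astra → astradb renames above (cli/cmds, connector, runner, runner/writers, and the v2 connector): any code importing those modules directly needs updated import paths. A minimal sketch; only the module paths come from this diff, the commented-out "before" line is illustrative:

# Before (0.0.2.dev0): the Astra connector module lived at connector/astra.py
# from unstructured_ingest.connector import astra

# After (0.0.4): the same module is now connector/astradb.py
from unstructured_ingest.connector import astradb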
unstructured_ingest/v2/pipeline/pipeline.py

@@ -9,12 +9,12 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
+from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
 from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
 from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
 from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
 from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
 from unstructured_ingest.v2.pipeline.steps.upload import Uploader, UploadStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.chunker import ChunkerConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     ConnectionConfig,
@@ -27,6 +27,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader
 from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+from unstructured_ingest.v2.processes.filter import FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import PartitionerConfig


@@ -37,22 +38,33 @@ class PipelineError(Exception):
 @dataclass
 class Pipeline:
     context: ProcessorConfig
+
     indexer: InitVar[IndexerT]
     indexer_step: IndexStep = field(init=False)
+
     downloader: InitVar[DownloaderT]
     downloader_step: DownloadStep = field(init=False)
+
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
+
     chunker: InitVar[Optional[Chunker]] = None
     chunker_step: ChunkStep = field(init=False, default=None)
+
     embedder: InitVar[Optional[Embedder]] = None
     embedder_step: EmbedStep = field(init=False, default=None)
+
     stager: InitVar[Optional[UploadStager]] = None
     stager_step: UploadStageStep = field(init=False, default=None)
+
     uploader: InitVar[Uploader] = field(default=LocalUploader())
     uploader_step: UploadStep = field(init=False, default=None)
+
     uncompress_step: UncompressStep = field(init=False, default=None)

+    filterer: InitVar[Optional[Filterer]] = None
+    filter_step: FilterStep = field(init=False, default=None)
+
     def __post_init__(
         self,
         indexer: IndexerT,
@@ -62,10 +74,12 @@ class Pipeline:
         embedder: Embedder = None,
         stager: UploadStager = None,
         uploader: Uploader = None,
+        filterer: Filterer = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         self.indexer_step = IndexStep(process=indexer, context=self.context)
         self.downloader_step = DownloadStep(process=downloader, context=self.context)
+        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
         self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None

@@ -109,6 +123,7 @@ class Pipeline:
     def run(self):
         try:
             start_time = time()
+            self._run_prechecks()
             self._run()
             logger.info(f"Finished ingest process in {time() - start_time}s")
         finally:
@@ -130,11 +145,39 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None

+    def _run_prechecks(self):
+        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
+        if self.chunker_step:
+            steps.append(self.chunker_step)
+        if self.embedder_step:
+            steps.append(self.embedder_step)
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        if self.stager_step:
+            steps.append(self.stager_step)
+        failures = {}
+        for step in steps:
+            try:
+                step.process.precheck()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step precheck failure: {k}: {v}")
+            raise PipelineError("Precheck failed")
+
+    def apply_filter(self, records: list[dict]) -> list[dict]:
+        if not self.filter_step:
+            return records
+        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
+        filtered_data = self.filter_step(data_to_filter)
+        filtered_data = [f for f in filtered_data if f is not None]
+        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
+        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
+        return filtered_records
+
     def _run(self):
-        logger.info(
-            f"Running local pipline: {self} with configs: "
-            f"{sterilize_dict(self.context.to_dict(redact_sensitive=True))}"
-        )
+        logger.info(f"Running local pipline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
@@ -147,18 +190,33 @@ class Pipeline:
         if not indices_inputs:
             return

+        # Initial filtering on indexed content
+        indices_inputs = self.apply_filter(records=indices_inputs)
+        if not indices_inputs:
+            return
+
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
             return

+        # Post download filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         # Run uncompress if available
         if self.uncompress_step:
             downloaded_data = self.uncompress_step(downloaded_data)
             # Flatten list of lists
             downloaded_data = self.clean_results(results=downloaded_data)

+        # Post uncompress filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         if not downloaded_data:
             return

@@ -179,9 +237,14 @@ class Pipeline:
         self.uploader_step(iterable=elements)

     def __str__(self):
-        s = [str(self.indexer_step), str(self.downloader_step)]
+        s = [str(self.indexer_step)]
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        s.append(str(self.downloader_step))
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
         if uncompress_step := self.uncompress_step:
-            s.append(str(uncompress_step))
+            s.extend([str(uncompress_step), str(filter_step)])
         s.append(str(self.partitioner_step))
         if chunker_step := self.chunker_step:
             s.append(str(chunker_step))
@@ -200,6 +263,7 @@ class Pipeline:
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
+        filterer_config: FiltererConfig = None,
         chunker_config: Optional[ChunkerConfig] = None,
         embedder_config: Optional[EmbedderConfig] = None,
         destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +299,8 @@ class Pipeline:
             ),
             "partitioner": Partitioner(config=partitioner_config),
         }
+        if filterer_config:
+            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
         if chunker_config:
             pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
         if embedder_config:
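The net effect of these pipeline.py changes: prechecks run on every configured step before _run(), and the optional filter step is applied three times (after indexing, after download, and after uncompress). For reference, a standalone sketch of the apply_filter pattern above; the filter_step callable here is a stand-in for the real FilterStep, which returns {"file_data_path": ...} for kept records and None for dropped ones:

# Standalone sketch of the apply_filter pattern: the filter step returns
# {"file_data_path": ...} for records it keeps and None for records it drops,
# and the pipeline keeps only the records whose path survived.
def apply_filter(records: list[dict], filter_step) -> list[dict]:
    data_to_filter = [{"file_data_path": r["file_data_path"]} for r in records]
    filtered = [f for f in filter_step(data_to_filter) if f is not None]
    kept_paths = {f["file_data_path"] for f in filtered}
    return [r for r in records if r["file_data_path"] in kept_paths]

records = [{"file_data_path": "a.json"}, {"file_data_path": "b.json"}]
keep_a_only = lambda batch: [d if d["file_data_path"].startswith("a") else None for d in batch]
print(apply_filter(records, keep_a_only))  # -> [{'file_data_path': 'a.json'}]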
unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.chunker import Chunker
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "chunk"

@@ -30,11 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")

     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
@@ -72,13 +66,13 @@ class ChunkStep(PipelineStep):
             chunked_content_raw = await fn(**fn_kwargs)
         self._save_output(
             output_filepath=str(output_filepath),
-            chunked_content=elements_to_dicts(chunked_content_raw),
+            chunked_content=chunked_content_raw,
         )
         return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
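Here and in the other step modules, config logging and hashing move from sterilize_dict/to_dict() to pydantic-style .json() plus a new serialize_base_model_json helper from unstructured_ingest/v2/utils.py (added in this release, +45 lines, body not shown in this diff). A rough sketch of what such a helper might look like, inferred only from how the call sites use it; treat it as an assumption, not the shipped implementation:

# Hedged sketch only: serialize_base_model_json is not part of this diff. The
# shape below is inferred from the call sites (model=, sort_keys=, ensure_ascii=
# keyword arguments and a JSON string return value).
import json

from pydantic import BaseModel


def serialize_base_model_json(
    model: BaseModel, sort_keys: bool = False, ensure_ascii: bool = False
) -> str:
    # Round-trip through the model's own JSON encoder so nested models and
    # custom types serialize consistently, then re-dump deterministically.
    as_dict = json.loads(model.json())
    return json.dumps(as_dict, sort_keys=sort_keys, ensure_ascii=ensure_ascii)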
unstructured_ingest/v2/pipeline/steps/download.py

@@ -2,13 +2,14 @@ import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar

 from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json

 DownloaderT = TypeVar("DownloaderT", bound=Downloader)

@@ -29,15 +30,9 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.download_config.to_dict(redact_sensitive=True))
-            if self.process.download_config
-            else None
-        )
+        config = self.process.download_config.json() if self.process.download_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -70,11 +65,40 @@ class DownloadStep(PipelineStep):
             return True
         return False

+    def update_file_data(
+        self, file_data: FileData, file_data_path: Path, download_path: Path
+    ) -> None:
+        file_size_bytes = download_path.stat().st_size
+        changed = False
+        if not file_data.metadata.filesize_bytes and file_size_bytes:
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if (
+            file_data.metadata.filesize_bytes
+            and file_data.metadata.filesize_bytes != file_size_bytes
+        ):
+            logger.warning(
+                f"file size in original file data "
+                f"({file_data.metadata.filesize_bytes}) doesn't "
+                f"match size of local file: {file_size_bytes}, updating"
+            )
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if changed:
+            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+            with file_data_path.open("w") as file:
+                json.dump(file_data.to_dict(), file, indent=2)
+
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            self.update_file_data(
+                file_data=file_data,
+                file_data_path=Path(file_data_path),
+                download_path=download_path,
+            )
             return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
@@ -85,26 +109,60 @@ class DownloadStep(PipelineStep):
         else:
             download_results = await fn(**fn_kwargs)
         return self.create_step_results(
-            current_file_data_path=file_data_path, download_results=download_results
+            current_file_data_path=file_data_path,
+            download_results=download_results,
+            current_file_data=file_data,
         )

     def create_step_results(
-        self, current_file_data_path: str, download_results: download_responses
+        self,
+        current_file_data_path: str,
+        current_file_data: FileData,
+        download_results: download_responses,
     ) -> list[DownloadStepResponse]:
+        responses = []
         if not isinstance(download_results, list):
-            return [
-                DownloadStepResponse(
-                    file_data_path=current_file_data_path, path=str(download_results["path"])
+            file_data = current_file_data
+            file_data_path = current_file_data_path
+            download_path = download_results["path"]
+            if download_results["file_data"].identifier == current_file_data.identifier:
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
+                ]
+            else:
+                file_data = download_results["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
                 )
-            ]
+                responses = [
+                    DownloadStepResponse(
+                        file_data_path=current_file_data_path, path=str(download_results["path"])
+                    )
+                ]
+        else:
             # Supplemental results generated as part of the download process
-        download_step_results = []
-        for res in download_results:
-            file_data_path = self.persist_new_file_data(file_data=res["file_data"])
-            download_step_results.append(
-                DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
-            )
-        return download_step_results
+            for res in download_results:
+                file_data = res["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                download_path = res["path"]
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses.append(
+                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
+                )
+
+        return responses

     def persist_new_file_data(self, file_data: FileData) -> str:
         record_hash = self.get_hash(extras=[file_data.identifier])
@@ -116,9 +174,17 @@ class DownloadStep(PipelineStep):
         return str(filepath)

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            sterilize_dict(self.process.download_config.to_dict()), sort_keys=True
+        download_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.download_config)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config)
         )
+        hashable_dict = {
+            "download_config": download_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
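The download step's cache key now covers both the download config and the connection config: each is serialized, combined under fixed keys, dumped with sort_keys=True, SHA-256 hashed, and truncated to 12 hex characters. A standalone sketch of that pattern, with plain dicts standing in for the real config models:

# Standalone sketch of the new get_hash pattern: both configs feed one
# deterministic JSON string that is SHA-256 hashed and truncated to 12 hex chars.
import hashlib
import json

def cache_key(download_config: dict, connection_config: dict, extras: list = None) -> str:
    hashable_dict = {
        "download_config": download_config,
        "connection_config": connection_config,
    }
    hashable_string = json.dumps(hashable_dict, sort_keys=True)
    if extras:
        hashable_string += "".join(extras)
    return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]

print(cache_key({"download_dir": "/tmp/ingest"}, {"anonymous": True}, extras=["file-123"]))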
unstructured_ingest/v2/pipeline/steps/embed.py

@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.embedder import Embedder
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "embed"

@@ -30,11 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")

     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
@@ -71,13 +65,13 @@ class EmbedStep(PipelineStep):

         self._save_output(
             output_filepath=str(output_filepath),
-            embedded_content=elements_to_dicts(embed_content_raw),
+            embedded_content=embed_content_raw,
         )
         return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
unstructured_ingest/v2/pipeline/steps/filter.py (new file)

@@ -0,0 +1,35 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.processes.filter import Filterer
+
+STEP_ID = "filter"
+
+
+@dataclass
+class FilterStep(PipelineStep):
+    process: Filterer
+    identifier: str = STEP_ID
+
+    def __post_init__(self):
+        config = self.process.config.json() if self.process.config else None
+        logger.info(f"Created {self.identifier} with configs: {config}")
+
+    async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
+        file_data = FileData.from_file(path=file_data_path)
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            resp = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                resp = await fn(**fn_kwargs)
+        else:
+            resp = await fn(**fn_kwargs)
+
+        if resp:
+            return {"file_data_path": file_data_path}
+        return None
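FilterStep only records which file_data_paths survive; the keep/drop decision comes from the wrapped Filterer (unstructured_ingest/v2/processes/filter.py, new in this release and not shown in this diff), whose callable receives a FileData and returns truthy to keep the record. A hedged sketch of a predicate honoring that contract, using only the metadata.filesize_bytes field this diff shows the download step maintaining:

# Hedged sketch of the contract FilterStep relies on; this predicate is a
# stand-in, not the real Filterer implementation.
def keep_small_files(file_data) -> bool:
    # filesize_bytes is the metadata field maintained by the download step (see
    # the download.py hunks above); None means the size is unknown, so keep it.
    size = getattr(file_data.metadata, "filesize_bytes", None)
    return size is None or size < 1_000_000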
unstructured_ingest/v2/pipeline/steps/index.py

@@ -6,7 +6,7 @@ from typing import Generator, Optional, TypeVar
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json

 IndexerT = TypeVar("IndexerT", bound=Indexer)

@@ -22,15 +22,9 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.index_config.to_dict(redact_sensitive=True))
-            if self.process.index_config
-            else None
-        )
+        config = self.process.index_config.json() if self.process.index_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -55,7 +49,17 @@ class IndexStep(PipelineStep):
                 continue

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(self.process.index_config.to_dict())
+        index_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.index_config, sort_keys=True)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config, sort_keys=True)
+        )
+        hashable_dict = {
+            "index_config": index_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
unstructured_ingest/v2/pipeline/steps/partition.py

@@ -8,8 +8,8 @@ from typing import Callable, Optional, TypedDict
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.partitioner import Partitioner
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "partition"

@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"

     def __post_init__(self):
-        config = sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+        config = self.process.config.json()
         logger.info(f"Created {self.identifier} with configs: {config}")

     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
@@ -56,7 +56,7 @@ class PartitionStep(PipelineStep):
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata}
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:
@@ -70,8 +70,8 @@ class PartitionStep(PipelineStep):
         return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
unstructured_ingest/v2/pipeline/steps/stage.py

@@ -1,6 +1,5 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
@@ -9,7 +8,7 @@ from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "upload_stage"

@@ -29,9 +28,7 @@ class UploadStageStep(PipelineStep):

     def __post_init__(self):
         config = (
-            sterilize_dict(self.process.upload_stager_config.to_dict(redact_sensitive=True))
-            if self.process.upload_stager_config
-            else None
+            self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"Created {self.identifier} with configs: {config}")
@@ -56,8 +53,8 @@ class UploadStageStep(PipelineStep):
         return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_output_path))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.upload_stager_config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.upload_stager_config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -5,7 +5,6 @@ from typing import Callable, TypedDict
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.uncompress import Uncompressor

 STEP_ID = "uncompress"
@@ -21,11 +20,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")

     def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
unstructured_ingest/v2/pipeline/steps/upload.py

@@ -7,7 +7,6 @@ from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict

 STEP_ID = "upload"

@@ -26,15 +25,9 @@ class UploadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.upload_config.to_dict(redact_sensitive=True))
-            if self.process.upload_config
-            else None
-        )
+        config = self.process.upload_config.json() if self.process.upload_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
unstructured_ingest/v2/processes/__init__.py (new file)

@@ -0,0 +1,18 @@
+from .chunker import Chunker, ChunkerConfig
+from .embedder import Embedder, EmbedderConfig
+from .filter import Filterer, FiltererConfig
+from .partitioner import Partitioner, PartitionerConfig
+from .uncompress import UncompressConfig, Uncompressor
+
+__all__ = [
+    "Chunker",
+    "ChunkerConfig",
+    "Embedder",
+    "EmbedderConfig",
+    "Filterer",
+    "FiltererConfig",
+    "Partitioner",
+    "PartitionerConfig",
+    "Uncompressor",
+    "UncompressConfig",
+]
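The new unstructured_ingest.v2.processes package re-exports each process class alongside its config, so downstream code can import them from a single namespace, for example:

# The new package __init__ shown above makes single-namespace imports possible.
from unstructured_ingest.v2.processes import (
    Chunker,
    ChunkerConfig,
    Filterer,
    FiltererConfig,
)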