unstructured-ingest 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +1 -1
- unstructured_ingest/logger.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/cmds/milvus.py +72 -0
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +3 -1
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -2
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/milvus.py +200 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +10 -7
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +44 -39
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.0.0"  # pragma: no cover
+__version__ = "0.0.2"  # pragma: no cover
unstructured_ingest/connector/notion/helpers.py
CHANGED
@@ -5,7 +5,6 @@ from typing import List, Optional, Tuple
 from urllib.parse import urlparse
 from uuid import UUID

-import unstructured.ingest.connector.notion.types.blocks as notion_blocks
 from htmlBuilder.attributes import Style, Type
 from htmlBuilder.tags import (
     Body,
@@ -23,6 +22,7 @@ from htmlBuilder.tags import (
 )
 from notion_client.errors import APIResponseError

+import unstructured_ingest.connector.notion.types.blocks as notion_blocks
 from unstructured_ingest.connector.notion.client import Client
 from unstructured_ingest.connector.notion.interfaces import BlockBase
 from unstructured_ingest.connector.notion.types.block import Block
unstructured_ingest/logger.py
CHANGED
@@ -3,7 +3,7 @@ import json
 import logging
 import typing as t

-logger = logging.getLogger("unstructured.ingest")
+logger = logging.getLogger("unstructured_ingest")


 def default_is_data_sensitive(k: str, v: t.Any) -> bool:
@@ -119,7 +119,7 @@ def ingest_log_streaming_init(level: int) -> None:

 def make_default_logger(level: int) -> logging.Logger:
     """Return a custom logger."""
-    logger = logging.getLogger("unstructured.ingest")
+    logger = logging.getLogger("unstructured_ingest")
     handler = logging.StreamHandler()
     handler.name = "ingest_log_handler"
     formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
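Downstream code that tunes ingest logging should now address the renamed logger; a one-line check, assuming the 0.0.2 wheel is installed:

    import logging

    # The library's logs are emitted under "unstructured_ingest" as of 0.0.2.
    logging.getLogger("unstructured_ingest").setLevel(logging.DEBUG)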
unstructured_ingest/v2/cli/base/cmd.py
CHANGED
@@ -24,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig

 CommandT = TypeVar("CommandT", bound=click.Command)
@@ -75,6 +76,8 @@ class BaseCmd(ABC):
         }
         if chunker := self.get_chunker(options=source_options):
             pipeline_kwargs["chunker"] = chunker
+        if filterer := self.get_filterer(options=source_options):
+            pipeline_kwargs["filterer"] = filterer
         if embedder := self.get_embeder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
@@ -105,6 +108,13 @@ class BaseCmd(ABC):
             return None
         return Chunker(config=chunker_config)

+    @staticmethod
+    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
+        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
+        if not filterer_configs.to_dict():
+            return None
+        return Filterer(config=filterer_configs)
+
     @staticmethod
     def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
         embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
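get_filterer only instantiates a Filterer when at least one filter flag survives extraction, so unconfigured runs skip the step entirely. The gating idea on a plain dict (flag names are assumed; extract_config itself is not shown in this diff):

    from typing import Any, Optional

    def get_filterer_sketch(options: dict[str, Any]) -> Optional[dict]:
        filter_keys = {"file_glob", "max_file_size"}  # assumed flag names
        config = {k: v for k, v in options.items() if k in filter_keys and v is not None}
        return config or None  # mirrors: empty extracted config -> no Filterer

    print(get_filterer_sketch({"file_glob": ["*.txt"], "verbose": True}))  # {'file_glob': ['*.txt']}
    print(get_filterer_sketch({"verbose": True}))  # None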
unstructured_ingest/v2/cli/base/src.py
CHANGED
@@ -8,6 +8,7 @@ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
 from unstructured_ingest.v2.cli.configs import (
     ChunkerCliConfig,
     EmbedderCliConfig,
+    FilterCliConfig,
     PartitionerCliConfig,
     ProcessorCliConfig,
 )
@@ -26,6 +27,7 @@ class SrcCmd(BaseCmd):
         ProcessorCliConfig,
         PartitionerCliConfig,
         EmbedderCliConfig,
+        FilterCliConfig,
         ChunkerCliConfig,
     ]
 )
unstructured_ingest/v2/cli/cmds/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from .fsspec.s3 import s3_dest_cmd, s3_src_cmd
 from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
 from .google_drive import google_drive_src_cmd
 from .local import local_dest_cmd, local_src_cmd
+from .milvus import milvus_dest_cmd
 from .mongodb import mongodb_dest_cmd
 from .onedrive import onedrive_drive_src_cmd
 from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
@@ -60,6 +61,7 @@ dest_cmds = [
     elasticsearch_dest_cmd,
     gcs_dest_cmd,
     local_dest_cmd,
+    milvus_dest_cmd,
     opensearch_dest_cmd,
     pinecone_dest_cmd,
     s3_dest_cmd,
unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py
CHANGED
@@ -3,7 +3,6 @@ from dataclasses import dataclass
 import click

 from unstructured_ingest.v2.cli.interfaces import CliConfig
-from unstructured_ingest.v2.cli.utils import DelimitedString


 @dataclass
@@ -14,7 +13,7 @@ class FsspecCliDownloadConfig(CliConfig):
         click.Option(
             ["--download-dir"],
             help="Where files are downloaded to, defaults to a location at"
-            "`$HOME/.cache/
+            "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
         ),
     ]

@@ -65,13 +64,6 @@ class FsspecCliIndexerConfig(FsspecCliFileConfig):
                 help="Recursively download files in their respective folders "
                 "otherwise stop at the files in provided folder level.",
             ),
-            click.Option(
-                ["--file-glob"],
-                default=None,
-                type=DelimitedString(),
-                help="A comma-separated list of file globs to limit which types of "
-                "local files are accepted, e.g. '*.html,*.txt'",
-            ),
         ]
     )
     return options
unstructured_ingest/v2/cli/cmds/local.py
CHANGED
@@ -4,7 +4,6 @@ import click

 from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
 from unstructured_ingest.v2.cli.interfaces import CliConfig
-from unstructured_ingest.v2.cli.utils import DelimitedString
 from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE


@@ -19,13 +18,6 @@ class LocalCliIndexerConfig(CliConfig):
                 type=click.Path(file_okay=True, dir_okay=True, exists=True),
                 help="Path to the location in the local file system that will be processed.",
             ),
-            click.Option(
-                ["--file-glob"],
-                default=None,
-                type=DelimitedString(),
-                help="A comma-separated list of file globs to limit which types of "
-                "local files are accepted, e.g. '*.html,*.txt'",
-            ),
             click.Option(
                 ["--recursive"],
                 is_flag=True,
unstructured_ingest/v2/cli/cmds/milvus.py
ADDED
@@ -0,0 +1,72 @@
+from dataclasses import dataclass
+
+import click
+
+from unstructured_ingest.v2.cli.base import DestCmd
+from unstructured_ingest.v2.cli.interfaces import CliConfig
+from unstructured_ingest.v2.processes.connectors.milvus import CONNECTOR_TYPE
+
+
+@dataclass
+class MilvusCliConnectionConfig(CliConfig):
+    @staticmethod
+    def get_cli_options() -> list[click.Option]:
+        options = [
+            click.Option(
+                ["--uri"],
+                required=False,
+                type=str,
+                default=None,
+                help="Milvus uri, eg 'http://localhost:19530",
+            ),
+            click.Option(
+                ["--user"],
+                required=False,
+                type=str,
+                default=None,
+                help="Milvus user",
+            ),
+            click.Option(
+                ["--password"],
+                required=False,
+                type=str,
+                default=None,
+                help="Milvus password",
+            ),
+            click.Option(
+                ["--db-name"],
+                required=False,
+                type=str,
+                default=None,
+                help="Milvus database name",
+            ),
+        ]
+        return options
+
+
+@dataclass
+class MilvusCliUploaderConfig(CliConfig):
+    @staticmethod
+    def get_cli_options() -> list[click.Option]:
+        options = [
+            click.Option(
+                ["--collection-name"],
+                required=True,
+                type=str,
+                help="Milvus collections to write to",
+            ),
+            click.Option(
+                ["--num-of-processes"],
+                type=click.IntRange(min=1),
+                default=4,
+                help="number of processes to use when writing to support parallel writes",
+            ),
+        ]
+        return options
+
+
+milvus_dest_cmd = DestCmd(
+    cmd_name=CONNECTOR_TYPE,
+    connection_config=MilvusCliConnectionConfig,
+    uploader_config=MilvusCliUploaderConfig,
+)
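As a quick preview of what these definitions expose, the options can be attached to a bare click command and the generated help rendered. Illustrative only; the released wheel assembles the real command through DestCmd:

    import click

    from unstructured_ingest.v2.cli.cmds.milvus import (
        MilvusCliConnectionConfig,
        MilvusCliUploaderConfig,
    )

    # Attach the new options to a throwaway command and print its help text.
    cmd = click.Command(
        "milvus",
        params=MilvusCliConnectionConfig.get_cli_options()
        + MilvusCliUploaderConfig.get_cli_options(),
    )
    with click.Context(cmd) as ctx:
        click.echo(cmd.get_help(ctx))  # --uri, --user, --password, --db-name,
                                       # --collection-name, --num-of-processes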
unstructured_ingest/v2/cli/configs/__init__.py
CHANGED
@@ -1,6 +1,13 @@
 from .chunk import ChunkerCliConfig
 from .embed import EmbedderCliConfig
+from .filter import FilterCliConfig
 from .partition import PartitionerCliConfig
 from .processor import ProcessorCliConfig

-__all__ = [
+__all__ = [
+    "ChunkerCliConfig",
+    "ProcessorCliConfig",
+    "PartitionerCliConfig",
+    "EmbedderCliConfig",
+    "FilterCliConfig",
+]
unstructured_ingest/v2/cli/configs/filter.py
ADDED
@@ -0,0 +1,28 @@
+from dataclasses import dataclass
+
+import click
+
+from unstructured_ingest.v2.cli.interfaces import CliConfig
+from unstructured_ingest.v2.cli.utils import DelimitedString
+
+
+@dataclass
+class FilterCliConfig(CliConfig):
+    @staticmethod
+    def get_cli_options() -> list[click.Option]:
+        options = [
+            click.Option(
+                ["--file-glob"],
+                default=None,
+                type=DelimitedString(),
+                help="A comma-separated list of file globs to limit which types of "
+                "local files are accepted, e.g. '*.html,*.txt'",
+            ),
+            click.Option(
+                ["--max-file-size"],
+                default=None,
+                type=click.IntRange(min=1),
+                help="Max file size to process in bytes",
+            ),
+        ]
+        return options
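These flags replace the per-connector --file-glob options removed from the fsspec and local commands above, and feed the new Filterer process in unstructured_ingest/v2/processes/filter.py (+54 lines, not shown in this diff). A minimal sketch of a glob-and-size filter along those lines; the class and field names here are assumptions, not the released code:

    import fnmatch
    from dataclasses import dataclass
    from pathlib import Path
    from typing import Optional

    @dataclass
    class FiltererConfigSketch:  # hypothetical stand-in for FiltererConfig
        file_glob: Optional[list[str]] = None  # e.g. ["*.html", "*.txt"]
        max_file_size: Optional[int] = None    # bytes, mirrors --max-file-size

        def passes(self, path: str, size_bytes: Optional[int]) -> bool:
            if self.file_glob is not None:
                name = Path(path).name
                if not any(fnmatch.fnmatch(name, g) for g in self.file_glob):
                    return False
            if self.max_file_size and size_bytes and size_bytes > self.max_file_size:
                return False
            return True

    cfg = FiltererConfigSketch(file_glob=["*.html", "*.txt"], max_file_size=1_000_000)
    print(cfg.passes("docs/index.html", 512))      # True
    print(cfg.passes("docs/raw.bin", 512))         # False: no glob match
    print(cfg.passes("docs/big.html", 2_000_000))  # False: over the size cap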
unstructured_ingest/v2/interfaces/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, SourceIdentifiers
+from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -26,4 +26,5 @@ __all__ = [
     "AccessConfig",
     "ConnectionConfig",
     "BaseConnector",
+    "FileDataSourceMetadata",
 ]
unstructured_ingest/v2/interfaces/downloader.py
CHANGED
@@ -30,6 +30,15 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     connector_type: str
     download_config: DownloaderConfigT

+    def get_download_path(self, file_data: FileData) -> Optional[Path]:
+        if not file_data.source_identifiers:
+            return None
+        rel_path = file_data.source_identifiers.relative_path
+        if not rel_path:
+            return None
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        return self.download_dir / Path(rel_path)
+
     @staticmethod
     def is_float(value: str):
         try:
@@ -68,9 +77,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     def is_async(self) -> bool:
         return True

-    def get_download_path(self, file_data: FileData) -> Optional[Path]:
-        return None
-
     @abstractmethod
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
         pass
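The method promoted above replaces a stub that always returned None, giving every downloader a deterministic location under download_dir when source identifiers carry a relative path. The path arithmetic, mirrored on plain values (illustrative, not the library class):

    from pathlib import Path

    # Mirrors the new default get_download_path: strip a leading slash from the
    # relative path, then resolve it under the configured download directory.
    download_dir = Path("/tmp/ingest-downloads")
    rel_path = "/docs/report.pdf"
    rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
    print(download_dir / Path(rel_path))  # /tmp/ingest-downloads/docs/report.pdf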
unstructured_ingest/v2/interfaces/file_data.py
CHANGED
@@ -22,13 +22,18 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath


+@dataclass
+class FileDataSourceMetadata(DataSourceMetadata):
+    filesize_bytes: Optional[int] = None
+
+
 @dataclass
 class FileData(DataClassJsonMixin):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
     doc_type: Literal["file", "batch"] = field(default="file")
-    metadata:
+    metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
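Because FileDataSourceMetadata only adds an optional field on top of DataSourceMetadata, file-data records serialized before this release still deserialize with filesize_bytes left as None. A stand-in demonstration with local dataclasses (not the library's own types):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class DataSourceMetadata:  # stand-in for the existing base class
        url: Optional[str] = None

    @dataclass
    class FileDataSourceMetadata(DataSourceMetadata):  # shape matches the diff
        filesize_bytes: Optional[int] = None

    old_record = {"url": "s3://bucket/key"}  # serialized before 0.0.2
    meta = FileDataSourceMetadata(**old_record)
    print(meta.filesize_bytes)  # None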
unstructured_ingest/v2/logger.py
CHANGED
@@ -5,7 +5,7 @@ from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
 from typing import Any, Callable

 log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
-LOGGER_NAME = "unstructured.ingest.v2"
+LOGGER_NAME = "unstructured_ingest.v2"


 def default_is_data_sensitive(k: str, v: Any) -> bool:
unstructured_ingest/v2/pipeline/interfaces.py
CHANGED
@@ -92,7 +92,7 @@ class PipelineStep(ABC):

         if iterable:
             if len(iterable) == 1:
-                return
+                return self.process_serially(iterable)
             if self.context.num_processes == 1:
                 return self.process_serially(iterable)
             with mp.Pool(
@@ -126,6 +126,8 @@
             logger.info(
                 f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
             )
+        else:
+            logger.info(f"Calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
            return self.process_async(iterable=iterable)
         if self.context.mp_supported:
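The one-line change in the first hunk closes an edge case: with exactly one input, the old branch returned None instead of processing the record. A stripped-down reproduction of the control flow (stand-in functions, not the library code):

    # With exactly one record, the old branch returned None and dropped it.
    def old_apply(iterable):
        if iterable:
            if len(iterable) == 1:
                return  # bug: the single record is never processed
        return [f"processed {i}" for i in iterable]

    def new_apply(iterable):
        if iterable:
            if len(iterable) == 1:
                return [f"processed {i}" for i in iterable]  # process serially
        return [f"processed {i}" for i in iterable]

    print(old_apply(["doc"]))  # None
    print(new_apply(["doc"]))  # ['processed doc']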
unstructured_ingest/v2/pipeline/pipeline.py
CHANGED
@@ -9,6 +9,7 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
+from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
 from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
 from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
 from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
@@ -27,6 +28,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader
 from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+from unstructured_ingest.v2.processes.filter import FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import PartitionerConfig


@@ -37,22 +39,33 @@ class PipelineError(Exception):
 @dataclass
 class Pipeline:
     context: ProcessorConfig
+
     indexer: InitVar[IndexerT]
     indexer_step: IndexStep = field(init=False)
+
     downloader: InitVar[DownloaderT]
     downloader_step: DownloadStep = field(init=False)
+
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
+
     chunker: InitVar[Optional[Chunker]] = None
     chunker_step: ChunkStep = field(init=False, default=None)
+
     embedder: InitVar[Optional[Embedder]] = None
     embedder_step: EmbedStep = field(init=False, default=None)
+
     stager: InitVar[Optional[UploadStager]] = None
     stager_step: UploadStageStep = field(init=False, default=None)
+
     uploader: InitVar[Uploader] = field(default=LocalUploader())
     uploader_step: UploadStep = field(init=False, default=None)
+
     uncompress_step: UncompressStep = field(init=False, default=None)

+    filterer: InitVar[Optional[Filterer]] = None
+    filter_step: FilterStep = field(init=False, default=None)
+
     def __post_init__(
         self,
         indexer: IndexerT,
@@ -62,10 +75,12 @@
         embedder: Embedder = None,
         stager: UploadStager = None,
         uploader: Uploader = None,
+        filterer: Filterer = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         self.indexer_step = IndexStep(process=indexer, context=self.context)
         self.downloader_step = DownloadStep(process=downloader, context=self.context)
+        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
         self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None

@@ -109,6 +124,7 @@
     def run(self):
         try:
             start_time = time()
+            self._run_prechecks()
             self._run()
             logger.info(f"Finished ingest process in {time() - start_time}s")
         finally:
@@ -130,6 +146,37 @@
         final = [f for f in flat if f]
         return final or None

+    def _run_prechecks(self):
+        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
+        if self.chunker_step:
+            steps.append(self.chunker_step)
+        if self.embedder_step:
+            steps.append(self.embedder_step)
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        if self.stager_step:
+            steps.append(self.stager_step)
+        failures = {}
+        for step in steps:
+            try:
+                step.process.precheck()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step precheck failure: {k}: {v}")
+            raise PipelineError("Precheck failed")
+
+    def apply_filter(self, records: list[dict]) -> list[dict]:
+        if not self.filter_step:
+            return records
+        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
+        filtered_data = self.filter_step(data_to_filter)
+        filtered_data = [f for f in filtered_data if f is not None]
+        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
+        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
+        return filtered_records
+
     def _run(self):
         logger.info(
             f"Running local pipline: {self} with configs: "
@@ -147,18 +194,33 @@
         if not indices_inputs:
             return

+        # Initial filtering on indexed content
+        indices_inputs = self.apply_filter(records=indices_inputs)
+        if not indices_inputs:
+            return
+
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
             return

+        # Post download filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         # Run uncompress if available
         if self.uncompress_step:
             downloaded_data = self.uncompress_step(downloaded_data)
             # Flatten list of lists
             downloaded_data = self.clean_results(results=downloaded_data)

+        # Post uncompress filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         if not downloaded_data:
             return

@@ -179,9 +241,14 @@
         self.uploader_step(iterable=elements)

     def __str__(self):
-        s = [str(self.indexer_step)
+        s = [str(self.indexer_step)]
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        s.append(str(self.downloader_step))
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
         if uncompress_step := self.uncompress_step:
-            s.
+            s.extend([str(uncompress_step), str(filter_step)])
         s.append(str(self.partitioner_step))
         if chunker_step := self.chunker_step:
             s.append(str(chunker_step))
@@ -200,6 +267,7 @@
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
+        filterer_config: FiltererConfig = None,
         chunker_config: Optional[ChunkerConfig] = None,
         embedder_config: Optional[EmbedderConfig] = None,
         destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +303,8 @@
             ),
             "partitioner": Partitioner(config=partitioner_config),
         }
+        if filterer_config:
+            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
         if chunker_config:
             pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
         if embedder_config:
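Taken together, the filter stage can now be enabled programmatically as well as from the CLI. A sketch of the wiring, assuming FiltererConfig's fields mirror the --file-glob and --max-file-size flags (processes/filter.py is not shown in this diff):

    from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig

    # Field names below are assumed from the CLI flags introduced in this release.
    filterer = Filterer(
        config=FiltererConfig(file_glob=["*.pdf", "*.docx"], max_file_size=10_000_000)
    )
    # Either hand the process to the constructor:
    #     Pipeline(..., filterer=filterer)
    # or pass the config through the new from_configs keyword:
    #     Pipeline.from_configs(..., filterer_config=filterer.config)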
unstructured_ingest/v2/pipeline/steps/download.py
CHANGED
@@ -2,6 +2,7 @@ import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar

 from unstructured_ingest.v2.interfaces import FileData, download_responses
@@ -70,11 +71,40 @@ class DownloadStep(PipelineStep):
             return True
         return False

+    def update_file_data(
+        self, file_data: FileData, file_data_path: Path, download_path: Path
+    ) -> None:
+        file_size_bytes = download_path.stat().st_size
+        changed = False
+        if not file_data.metadata.filesize_bytes and file_size_bytes:
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if (
+            file_data.metadata.filesize_bytes
+            and file_data.metadata.filesize_bytes != file_size_bytes
+        ):
+            logger.warning(
+                f"file size in original file data "
+                f"({file_data.metadata.filesize_bytes}) doesn't "
+                f"match size of local file: {file_size_bytes}, updating"
+            )
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if changed:
+            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+            with file_data_path.open("w") as file:
+                json.dump(file_data.to_dict(), file, indent=2)
+
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            self.update_file_data(
+                file_data=file_data,
+                file_data_path=Path(file_data_path),
+                download_path=download_path,
+            )
             return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
@@ -85,26 +115,60 @@ class DownloadStep(PipelineStep):
         else:
             download_results = await fn(**fn_kwargs)
         return self.create_step_results(
-            current_file_data_path=file_data_path,
+            current_file_data_path=file_data_path,
+            download_results=download_results,
+            current_file_data=file_data,
         )

     def create_step_results(
-        self,
+        self,
+        current_file_data_path: str,
+        current_file_data: FileData,
+        download_results: download_responses,
     ) -> list[DownloadStepResponse]:
+        responses = []
         if not isinstance(download_results, list):
-
-
-
+            file_data = current_file_data
+            file_data_path = current_file_data_path
+            download_path = download_results["path"]
+            if download_results["file_data"].identifier == current_file_data.identifier:
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
+                ]
+            else:
+                file_data = download_results["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
                 )
-
+                responses = [
+                    DownloadStepResponse(
+                        file_data_path=current_file_data_path, path=str(download_results["path"])
+                    )
+                ]
+        else:
             # Supplemental results generated as part of the download process
-
-
-
-
-
-
-
+            for res in download_results:
+                file_data = res["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                download_path = res["path"]
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses.append(
+                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
+                )
+
+        return responses

     def persist_new_file_data(self, file_data: FileData) -> str:
         record_hash = self.get_hash(extras=[file_data.identifier])
|