unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +21 -11
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +4 -6
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/data_prep.py +20 -12
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +3 -2
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +12 -3
- unstructured_ingest/v2/pipeline/pipeline.py +42 -29
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
- unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
- unstructured_ingest/v2/processes/embedder.py +41 -24
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/pipeline/reformat/embedding.py
CHANGED
```diff
@@ -27,8 +27,6 @@ class Embedder(ReformatNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
 
     def run(self, elements_json: str) -> Optional[str]:
-        from unstructured.staging.base import elements_from_json
-
         try:
             elements_json_filename = os.path.basename(elements_json)
             filename_ext = os.path.basename(elements_json_filename)
@@ -46,12 +44,12 @@
                 and json_path.is_file()
                 and json_path.stat().st_size
             ):
-                logger.debug(f"
+                logger.debug(f"file exists: {json_path}, skipping embedding")
                 return str(json_path)
-
+            with open(elements_json) as f:
+                elements = json.load(f)
             embedder = self.embedder_config.get_embedder()
-
-            element_dicts = [e.to_dict() for e in embedded_elements]
+            element_dicts = embedder.embed_documents(elements=elements)
             with open(json_path, "w", encoding="utf8") as output_f:
                 logger.info(f"writing embeddings content to {json_path}")
                 json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
```
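The rewritten `run()` above no longer rehydrates `Element` objects from JSON; it loads the raw element dictionaries, hands them to `embedder.embed_documents(elements=...)`, and writes the returned dictionaries back out. A minimal sketch of that flow, with a made-up `FakeEmbedder` standing in for whatever `embedder_config.get_embedder()` returns (only the `embed_documents` call shape comes from the diff; everything else here is illustrative):

```python
import json
from pathlib import Path
from typing import Any


class FakeEmbedder:
    """Stand-in for the configured embedder; only the call shape matters here."""

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        # A real embedder would attach real vectors; here we just tag each element dict.
        return [{**e, "embeddings": [0.0, 0.0]} for e in elements]


def embed_elements_file(elements_json: Path, json_path: Path, embedder: Any) -> Path:
    # Load the raw element dicts instead of rehydrating Element objects.
    with open(elements_json) as f:
        elements = json.load(f)
    element_dicts = embedder.embed_documents(elements=elements)
    with open(json_path, "w", encoding="utf8") as output_f:
        json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
    return json_path
```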
unstructured_ingest/pipeline/source.py
CHANGED
```diff
@@ -24,12 +24,12 @@ class Reader(SourceNode):
                 and doc.filename.is_file()
                 and doc.filename.stat().st_size
             ):
-                logger.info(f"
+                logger.info(f"file exists: {doc.filename}, skipping download")
                 # Still need to fetch metadata if file exists locally
                 doc.update_source_metadata()
             else:
                 serialized_doc = doc.to_json(redact_sensitive=True)
-                logger.debug(f"
+                logger.debug(f"fetching {serialized_doc} - PID: {os.getpid()}")
                 if self.retry_strategy:
                     self.retry_strategy(doc.get_file)
                 else:
```
unstructured_ingest/utils/compression.py
CHANGED
```diff
@@ -22,7 +22,7 @@ TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
 
 def uncompress_file(filename: str, path: Optional[str] = None) -> str:
     """
-    Takes in a compressed zip or tar file and
+    Takes in a compressed zip or tar file and decompresses it
     """
     # Create path if it doesn't already exist
     if path:
@@ -65,7 +65,7 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
     logger.info(f"extracting tar {tar_filename} -> {path}")
     # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
     with tarfile.open(tar_filename, "r:*") as tfile:
-        # NOTE(robinson: Mitigate against malicious content being extracted from the tar file.
+        # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
         # This was added in Python 3.12
         # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
         if sys.version_info >= (3, 12):
@@ -113,6 +113,6 @@ class CompressionSourceConnectorMixin:
             read_config=new_read_configs,
             processor_config=new_process_configs,
         )
-        logger.info(f"
+        logger.info(f"created local source connector: {local_connector.to_json()}")
        local_connector.initialize()
        return local_connector.get_ingest_docs()
```
unstructured_ingest/utils/data_prep.py
CHANGED
```diff
@@ -1,12 +1,15 @@
 import itertools
 import json
 from datetime import datetime
-from typing import Any, Optional, Sequence, cast
+from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
+T = TypeVar("T")
+IterableT = Iterable[T]
 
-def batch_generator(iterable, batch_size=100):
+
+def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
     """A helper function to break an iterable into batches of size batch_size."""
     it = iter(iterable)
     chunk = tuple(itertools.islice(it, batch_size))
@@ -16,23 +19,28 @@ def batch_generator(iterable, batch_size=100):
 
 
 def generator_batching_wbytes(
-    iterable
-
+    iterable: IterableT,
+    batch_size_limit_bytes: Optional[int] = None,
+    max_batch_size: Optional[int] = None,
+) -> IterableT:
+    if not batch_size_limit_bytes and not max_batch_size:
+        return iterable
     """A helper function to break an iterable into chunks of specified bytes."""
     current_batch, current_batch_size = [], 0
 
     for item in iterable:
         item_size_bytes = len(json.dumps(item).encode("utf-8"))
-
-
-        current_batch_size
-
-
-            current_batch.append(item)
-            current_batch_size += item_size_bytes
-        else:
+        if batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes:
+            yield current_batch
+            current_batch, current_batch_size = [item], item_size_bytes
+            continue
+        if max_batch_size and len(current_batch) + 1 > max_batch_size:
             yield current_batch
             current_batch, current_batch_size = [item], item_size_bytes
+            continue
+
+        current_batch.append(item)
+        current_batch_size += item_size_bytes
 
     if current_batch:
         yield current_batch
```
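The reworked helpers above add typing and a second, byte-aware batching path: `generator_batching_wbytes` now caps each batch by serialized JSON size (`batch_size_limit_bytes`) and/or by item count (`max_batch_size`), and passes the iterable through untouched when neither limit is set. A standalone sketch of the same batching idea; the function name, helper structure, and demo values below are illustrative, not the package's API:

```python
import json
from typing import Any, Iterable, Iterator, Optional


def batch_by_bytes(
    iterable: Iterable[Any],
    batch_size_limit_bytes: Optional[int] = None,
    max_batch_size: Optional[int] = None,
) -> Iterator[list]:
    """Yield lists capped by serialized JSON size and/or item count."""
    current_batch: list = []
    current_batch_size = 0
    for item in iterable:
        item_size_bytes = len(json.dumps(item).encode("utf-8"))
        # Start a new batch when either limit would be exceeded by adding this item.
        if current_batch and (
            (batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes)
            or (max_batch_size and len(current_batch) + 1 > max_batch_size)
        ):
            yield current_batch
            current_batch, current_batch_size = [], 0
        current_batch.append(item)
        current_batch_size += item_size_bytes
    if current_batch:
        yield current_batch


if __name__ == "__main__":
    docs = [{"text": "x" * n} for n in range(1, 8)]
    for batch in batch_by_bytes(docs, batch_size_limit_bytes=40, max_batch_size=3):
        print(len(batch), sum(len(json.dumps(d)) for d in batch))
```

With `batch_size_limit_bytes=40` and `max_batch_size=3`, the toy run above yields batches that close as soon as either limit would be crossed, which is the behavior the new keyword arguments are there to provide.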
unstructured_ingest/utils/string_and_date_utils.py
CHANGED
```diff
@@ -10,13 +10,13 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
     try:
         return json.loads(json_string)
     except json.JSONDecodeError:
-        # Not
+        # Not necessary an error if it is a path or malformed json
         pass
     try:
         # This is common when single quotes are used instead of double quotes
         return json.loads(json_string.replace("'", '"'))
     except json.JSONDecodeError:
-        # Not
+        # Not necessary an error if it is a path
         pass
     return json_string
 
```
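For readers skimming the hunk: the surrounding function tries strict JSON first, retries with single quotes swapped for double quotes, and otherwise returns the input string unchanged (for example when it is really a file path). A self-contained copy of that fallback order, mirroring the context lines above, with a few made-up sample inputs:

```python
import json
import typing as t


def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
    """Best-effort parse: strict JSON, then single-quote JSON, else return the input unchanged."""
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        pass  # not necessarily an error: could be a path or malformed json
    try:
        # Common when single quotes are used instead of double quotes
        return json.loads(json_string.replace("'", '"'))
    except json.JSONDecodeError:
        pass  # not necessarily an error: could be a path
    return json_string


print(json_to_dict('{"a": 1}'))          # {'a': 1}
print(json_to_dict("{'a': 1}"))          # {'a': 1} after quote replacement
print(json_to_dict("/tmp/config.json"))  # returned unchanged as a path string
```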
unstructured_ingest/v2/cli/base/cmd.py
CHANGED
```diff
@@ -102,7 +102,7 @@ class BaseCmd(ABC):
         cmd.params.extend(options)
         return cmd
 
-    def
+    def get_pipeline(
         self,
         src: str,
         source_options: dict[str, Any],
@@ -122,7 +122,7 @@ class BaseCmd(ABC):
             pipeline_kwargs["chunker"] = chunker
         if filterer := self.get_filterer(options=source_options):
             pipeline_kwargs["filterer"] = filterer
-        if embedder := self.
+        if embedder := self.get_embedder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
             logger.debug(
@@ -160,7 +160,7 @@ class BaseCmd(ABC):
         return Filterer(config=filterer_configs)
 
     @staticmethod
-    def
+    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
         embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
         if not embedder_config.embedding_provider:
             return None
```
unstructured_ingest/v2/cli/base/dest.py
CHANGED
```diff
@@ -40,7 +40,7 @@ class DestCmd(BaseCmd):
         source_options: dict = ctx.parent.params if ctx.parent else {}
         conform_click_options(options)
         try:
-            pipeline = self.
+            pipeline = self.get_pipeline(
                 src=source_cmd,
                 source_options=source_options,
                 dest=self.cmd_name,
```
unstructured_ingest/v2/cli/base/src.py
CHANGED
```diff
@@ -1,5 +1,6 @@
 import logging
 from dataclasses import dataclass, field
+from typing import Any
 
 import click
 from pydantic import BaseModel
@@ -47,14 +48,14 @@ class SrcCmd(BaseCmd):
         options = self.consolidate_options(options=options)
         return options
 
-    def cmd(self, ctx: click.Context, **options) -> None:
+    def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
         if ctx.invoked_subcommand:
             return
 
         conform_click_options(options)
         logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
         try:
-            pipeline = self.
+            pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
             pipeline.run()
         except Exception as e:
             logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
```
unstructured_ingest/v2/interfaces/processor.py
CHANGED
```diff
@@ -11,21 +11,56 @@ DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pi
 class ProcessorConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    reprocess: bool =
-
-
-
-
-
-
+    reprocess: bool = Field(
+        default=False,
+        description="Reprocess a downloaded file even if the relevant structured "
+        "output .json file in output directory already exists.",
+    )
+    verbose: bool = Field(default=False)
+    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
+    work_dir: str = Field(
+        default_factory=lambda: DEFAULT_WORK_DIR,
+        description="Where to place working files when processing each step",
+    )
+    num_processes: int = Field(
+        default=2, description="Number of parallel processes with which to process docs"
+    )
+    max_connections: Optional[int] = Field(
+        default=None, description="Limit of concurrent connectionts"
+    )
+    raise_on_error: bool = Field(
+        default=False,
+        description="Is set, will raise error if any doc in the pipeline fail. "
+        "Otherwise will log error and continue with other docs",
+    )
     disable_parallelism: bool = Field(
-        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
+        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
+    )
+    preserve_downloads: bool = Field(
+        default=False, description="Don't delete downloaded files after process completes"
+    )
+    download_only: bool = Field(
+        default=False, description="skip the rest of the process after files are downloaded"
+    )
+    re_download: bool = Field(
+        default=False,
+        description="If set, will re-download downloaded files "
+        "regardless of if they already exist locally",
+    )
+    uncompress: bool = Field(
+        default=False,
+        description="Uncompress any archived files. Currently supporting "
+        "zip and tar files based on file extension.",
+    )
+    iter_delete: bool = Field(
+        default=False,
+        description="If limited on memory, this can be enabled to delete "
+        "cached content as it's used and no longer needed in the pipeline.",
+    )
+    delete_cache: bool = Field(
+        default=False,
+        description="If set, will delete the cache work directory when process finishes",
     )
-    preserve_downloads: bool = False
-    download_only: bool = False
-    max_docs: Optional[int] = None
-    re_download: bool = False
-    uncompress: bool = False
 
     # OTEL support
     otel_endpoint: Optional[str] = Field(
```
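Since `ProcessorConfig` is a plain pydantic model, the new flags are set at construction time. A minimal sketch, assuming the `unstructured_ingest.v2.interfaces` import path that appears elsewhere in this diff and using purely illustrative values:

```python
from unstructured_ingest.v2.interfaces import ProcessorConfig

# Illustrative values only; the field names come from the model shown above.
config = ProcessorConfig(
    num_processes=4,           # parallel worker processes
    reprocess=False,           # reuse existing structured output when present
    iter_delete=True,          # drop each step's cached output once it has been consumed
    delete_cache=True,         # remove the whole work_dir when the run finishes
    preserve_downloads=False,  # allow downloaded files to be cleaned up
    tqdm=True,                 # display a progress bar
)
print(config.work_dir)  # defaults to a per-user cache dir under ~/.cache/unstructured/ingest/
```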
unstructured_ingest/v2/logger.py
CHANGED
```diff
@@ -101,7 +101,7 @@ class SensitiveFormatter(Formatter):
 
 
 def remove_root_handlers(logger: Logger) -> None:
-    # NOTE(robinson)
+    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
     # Removing these when they exist prevents this behavior
     if logger.root.hasHandlers():
```
unstructured_ingest/v2/otel.py
CHANGED
```diff
@@ -92,7 +92,7 @@ class OtelHandler:
             return None
         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
 
-        logger.debug(f"
+        logger.debug(f"adding otel exported at {otel_endpoint}")
         trace_exporter = OTLPSpanExporter()
         processor = SimpleSpanProcessor(trace_exporter)
         provider.add_span_processor(processor)
```
unstructured_ingest/v2/pipeline/interfaces.py
CHANGED
```diff
@@ -1,6 +1,9 @@
+from __future__ import annotations
+
 import asyncio
 import logging
 import multiprocessing as mp
+import shutil
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -116,10 +119,10 @@ class PipelineStep(ABC):
         iterable = iterable or []
         if iterable:
             logger.info(
-                f"
+                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
             )
         else:
-            logger.info(f"
+            logger.info(f"calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
             return self.process_async(iterable=iterable)
         if self.context.mp_supported:
@@ -132,7 +135,7 @@ class PipelineStep(ABC):
     async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
         raise NotImplementedError
 
-    def run(self, _fn:
+    def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
         kwargs = kwargs.copy()
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
         tracer = otel_handler.get_tracer()
@@ -178,6 +181,12 @@ class PipelineStep(ABC):
     def cache_dir(self) -> Path:
         return Path(self.context.work_dir) / self.identifier
 
+    def delete_cache(self):
+        if self.context.iter_delete and self.cache_dir.exists():
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
+
 
 @dataclass
 class BatchPipelineStep(PipelineStep, ABC):
```
unstructured_ingest/v2/pipeline/pipeline.py
CHANGED
```diff
@@ -1,7 +1,11 @@
+from __future__ import annotations
+
 import logging
 import multiprocessing as mp
+import shutil
 from dataclasses import InitVar, dataclass, field
-from
+from pathlib import Path
+from typing import Any
 
 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
 from unstructured_ingest.v2.logger import logger, make_default_logger
@@ -48,33 +52,33 @@ class Pipeline:
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
 
-    chunker: InitVar[
-    chunker_step: ChunkStep = field(init=False, default=None)
+    chunker: InitVar[Chunker | None] = None
+    chunker_step: ChunkStep | None = field(init=False, default=None)
 
-    embedder: InitVar[
-    embedder_step: EmbedStep = field(init=False, default=None)
+    embedder: InitVar[Embedder | None] = None
+    embedder_step: EmbedStep | None = field(init=False, default=None)
 
-    stager: InitVar[
-    stager_step: UploadStageStep = field(init=False, default=None)
+    stager: InitVar[UploadStager | None] = None
+    stager_step: UploadStageStep | None = field(init=False, default=None)
 
     uploader: InitVar[Uploader] = field(default=LocalUploader())
-    uploader_step: UploadStep = field(init=False, default=None)
+    uploader_step: UploadStep | None = field(init=False, default=None)
 
-    uncompress_step: UncompressStep = field(init=False, default=None)
+    uncompress_step: UncompressStep | None = field(init=False, default=None)
 
-    filterer: InitVar[
-    filter_step: FilterStep = field(init=False, default=None)
+    filterer: InitVar[Filterer | None] = None
+    filter_step: FilterStep | None = field(init=False, default=None)
 
     def __post_init__(
         self,
         indexer: IndexerT,
         downloader: DownloaderT,
         partitioner: Partitioner,
-        chunker: Chunker = None,
-        embedder: Embedder = None,
-        stager: UploadStager = None,
-        uploader: Uploader = None,
-        filterer: Filterer = None,
+        chunker: Chunker | None = None,
+        embedder: Embedder | None = None,
+        stager: UploadStager | None = None,
+        uploader: Uploader | None = None,
+        filterer: Filterer | None = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
@@ -113,7 +117,9 @@ class Pipeline:
         )
 
     def cleanup(self):
-
+        if self.context.delete_cache and Path(self.context.work_dir).exists():
+            logger.info(f"deleting cache directory: {self.context.work_dir}")
+            shutil.rmtree(self.context.work_dir)
 
     def log_statuses(self):
         if status := self.context.status:
@@ -136,7 +142,7 @@ class Pipeline:
         if self.context.status:
             raise PipelineError("Pipeline did not run successfully")
 
-    def clean_results(self, results:
+    def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
         if not results:
             return None
         results = [r for r in results if r]
@@ -181,7 +187,7 @@ class Pipeline:
         return filtered_records
 
     def _run(self):
-        logger.info(f"
+        logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
@@ -226,26 +232,33 @@ class Pipeline:
             logger.info("No files to process after filtering uncompressed content, exiting")
             return
 
-        if not downloaded_data:
+        if not downloaded_data or self.context.download_only:
             return
 
         # Partition content
         elements = self.partitioner_step(downloaded_data)
+        # Download data non longer needed, delete if possible
+        self.downloader_step.delete_cache()
         elements = self.clean_results(results=elements)
         if not elements:
             logger.info("No files to process after partitioning, exiting")
             return
 
         # Run element specific modifiers
-
-
+        last_step = self.partitioner_step
+        for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
+            elements = step(elements)
             elements = self.clean_results(results=elements)
+            # Delete data from previous step if possible since no longer needed
+            last_step.delete_cache()
+            last_step = step
             if not elements:
-                logger.info(f"
+                logger.info(f"no files to process after {step.__class__.__name__}, exiting")
                 return
 
         # Upload the final result
         self.uploader_step(iterable=elements)
+        last_step.delete_cache()
 
     def __str__(self):
         s = [str(self.indexer_step)]
@@ -274,12 +287,12 @@ class Pipeline:
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
-        filterer_config: FiltererConfig = None,
-        chunker_config:
-        embedder_config:
-        destination_connection_config:
-        stager_config:
-        uploader_config:
+        filterer_config: FiltererConfig | None = None,
+        chunker_config: ChunkerConfig | None = None,
+        embedder_config: EmbedderConfig | None = None,
+        destination_connection_config: ConnectionConfig | None = None,
+        stager_config: UploadStagerConfigT | None = None,
+        uploader_config: UploaderConfigT | None = None,
     ) -> "Pipeline":
         # Get registry key based on indexer config
         source_entry = {
```
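The `_run` changes above thread a `last_step` reference through the optional chunk/embed/stage steps so that each predecessor's cached output can be deleted as soon as it has been consumed (and only when `iter_delete` is enabled, per the `delete_cache` gate in `interfaces.py`). A schematic, self-contained sketch of that control flow with toy step objects; the `DummyStep` class and `run_element_steps` helper are hypothetical stand-ins, not part of the package:

```python
from typing import Callable, List, Optional


class DummyStep:
    """Toy stand-in for a pipeline step: callable, with a delete_cache() hook."""

    def __init__(self, name: str, fn: Callable[[list], list]):
        self.name = name
        self.fn = fn

    def __call__(self, elements: list) -> list:
        return self.fn(elements)

    def delete_cache(self) -> None:
        print(f"deleting cache for {self.name}")


def run_element_steps(
    elements: list, partition_step: DummyStep, steps: List[Optional[DummyStep]]
) -> Optional[list]:
    """Run each configured step, then drop the previous step's cache (mirrors the loop above)."""
    last_step = partition_step
    for step in [s for s in steps if s]:
        elements = step(elements)
        last_step.delete_cache()  # the previous step's cached output is no longer needed
        last_step = step
        if not elements:
            return None
    # In the real pipeline the upload runs here; only then is the final step's cache released.
    last_step.delete_cache()
    return elements


if __name__ == "__main__":
    partition = DummyStep("partition", lambda e: e)
    chunk = DummyStep("chunk", lambda e: e)
    embed = DummyStep("embed", lambda e: e)
    print(run_element_steps([{"text": "hi"}], partition, [chunk, None, embed]))
```

Unconfigured steps are filtered out the same way the real loop filters `None` entries, so the cache released at the end always belongs to the last step that actually ran.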
unstructured_ingest/v2/pipeline/steps/chunk.py
CHANGED
```diff
@@ -29,7 +29,7 @@ class ChunkStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class ChunkStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing chunker output to: {output_filepath}")
             json.dump(chunked_content, f, indent=2)
 
     async def _run_async(
@@ -54,7 +54,7 @@ class ChunkStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping chunking, output already exists: {output_filepath}")
             return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):
```
unstructured_ingest/v2/pipeline/steps/download.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 import asyncio
 import hashlib
 import json
+import shutil
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar
@@ -82,7 +83,7 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-        logger.debug(f"
+        logger.debug(f"updating file data with new content: {file_data.to_dict()}")
         with file_data_path.open("w") as file:
             json.dump(file_data.to_dict(), file, indent=2)
 
@@ -90,7 +91,7 @@
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
-            logger.debug(f"
+            logger.debug(f"skipping download, file already exists locally: {download_path}")
             self.update_file_data(
                 file_data=file_data,
                 file_data_path=Path(file_data_path),
@@ -185,3 +186,17 @@
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
+
+    @property
+    def cache_dir(self) -> Path:
+        return self.process.download_config.download_dir
+
+    def delete_cache(self):
+        if (
+            self.context.iter_delete
+            and not self.context.preserve_downloads
+            and self.cache_dir.exists()
+        ):
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
```
unstructured_ingest/v2/pipeline/steps/embed.py
CHANGED
```diff
@@ -29,7 +29,7 @@ class EmbedStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class EmbedStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing embedded output to: {output_filepath}")
             json.dump(embedded_content, f, indent=2)
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
@@ -52,7 +52,7 @@ class EmbedStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping embedding, output already exists: {output_filepath}")
             return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):
```
unstructured_ingest/v2/pipeline/steps/filter.py
CHANGED
```diff
@@ -17,7 +17,7 @@ class FilterStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
         file_data = FileData.from_file(path=file_data_path)
```
unstructured_ingest/v2/pipeline/steps/index.py
CHANGED
```diff
@@ -28,14 +28,14 @@ class IndexStep(PipelineStep):
             self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
-            f"
+            f"created {self.identifier} with configs: {config}, "
             f"connection configs: {connection_config}"
         )
 
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"
+            logger.debug(f"generated file data: {file_data.to_dict()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
```
unstructured_ingest/v2/pipeline/steps/partition.py
CHANGED
```diff
@@ -29,7 +29,7 @@ class PartitionStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json()
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class PartitionStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing partitioned output to: {output_filepath}")
             json.dump(partitioned_content, f, indent=2)
 
     async def _run_async(
@@ -54,7 +54,7 @@ class PartitionStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
```