unstructured-ingest 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/interfaces.py +4 -4
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +9 -2
- unstructured_ingest/v2/pipeline/pipeline.py +17 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -5
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/METADATA +272 -274
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/RECORD +69 -69
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/top_level.txt +0 -0
unstructured_ingest/utils/string_and_date_utils.py CHANGED
@@ -10,13 +10,13 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
     try:
         return json.loads(json_string)
     except json.JSONDecodeError:
-        # Not
+        # Not necessary an error if it is a path or malformed json
         pass
     try:
         # This is common when single quotes are used instead of double quotes
         return json.loads(json_string.replace("'", '"'))
     except json.JSONDecodeError:
-        # Not
+        # Not necessary an error if it is a path
        pass
     return json_string

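The change above only lengthens the two inline comments; the fallback behavior of json_to_dict is unchanged. For reference, a minimal usage sketch of that behavior (the import path is inferred from the file list above and should be treated as an assumption):

    from unstructured_ingest.utils.string_and_date_utils import json_to_dict  # assumed import path

    # Well-formed JSON parses directly.
    assert json_to_dict('{"a": 1}') == {"a": 1}

    # Single-quoted pseudo-JSON is retried with the quotes swapped to double quotes.
    assert json_to_dict("{'a': 1}") == {"a": 1}

    # Anything else (a filesystem path, malformed JSON) is returned unchanged as a string.
    assert json_to_dict("/tmp/config.json") == "/tmp/config.json"
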
unstructured_ingest/v2/cli/base/cmd.py CHANGED
@@ -102,7 +102,7 @@ class BaseCmd(ABC):
         cmd.params.extend(options)
         return cmd

-    def
+    def get_pipeline(
         self,
         src: str,
         source_options: dict[str, Any],
@@ -122,7 +122,7 @@ class BaseCmd(ABC):
             pipeline_kwargs["chunker"] = chunker
         if filterer := self.get_filterer(options=source_options):
             pipeline_kwargs["filterer"] = filterer
-        if embedder := self.
+        if embedder := self.get_embedder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
             logger.debug(
@@ -160,7 +160,7 @@ class BaseCmd(ABC):
         return Filterer(config=filterer_configs)

     @staticmethod
-    def
+    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
         embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
         if not embedder_config.embedding_provider:
             return None

unstructured_ingest/v2/cli/base/dest.py CHANGED
@@ -40,7 +40,7 @@ class DestCmd(BaseCmd):
         source_options: dict = ctx.parent.params if ctx.parent else {}
         conform_click_options(options)
         try:
-            pipeline = self.
+            pipeline = self.get_pipeline(
                 src=source_cmd,
                 source_options=source_options,
                 dest=self.cmd_name,

unstructured_ingest/v2/cli/base/src.py CHANGED
@@ -55,7 +55,7 @@ class SrcCmd(BaseCmd):
         conform_click_options(options)
         logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
         try:
-            pipeline = self.
+            pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
             pipeline.run()
         except Exception as e:
             logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)

unstructured_ingest/v2/interfaces/processor.py CHANGED
@@ -11,21 +11,56 @@ DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pi
 class ProcessorConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)

-    reprocess: bool =
-
-
-
-
-
-
+    reprocess: bool = Field(
+        default=False,
+        description="Reprocess a downloaded file even if the relevant structured "
+        "output .json file in output directory already exists.",
+    )
+    verbose: bool = Field(default=False)
+    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
+    work_dir: str = Field(
+        default_factory=lambda: DEFAULT_WORK_DIR,
+        description="Where to place working files when processing each step",
+    )
+    num_processes: int = Field(
+        default=2, description="Number of parallel processes with which to process docs"
+    )
+    max_connections: Optional[int] = Field(
+        default=None, description="Limit of concurrent connectionts"
+    )
+    raise_on_error: bool = Field(
+        default=False,
+        description="Is set, will raise error if any doc in the pipeline fail. "
+        "Otherwise will log error and continue with other docs",
+    )
     disable_parallelism: bool = Field(
-        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
+        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
+    )
+    preserve_downloads: bool = Field(
+        default=False, description="Don't delete downloaded files after process completes"
+    )
+    download_only: bool = Field(
+        default=False, description="skip the rest of the process after files are downloaded"
+    )
+    re_download: bool = Field(
+        default=False,
+        description="If set, will re-download downloaded files "
+        "regardless of if they already exist locally",
+    )
+    uncompress: bool = Field(
+        default=False,
+        description="Uncompress any archived files. Currently supporting "
+        "zip and tar files based on file extension.",
+    )
+    iter_delete: bool = Field(
+        default=False,
+        description="If limited on memory, this can be enabled to delete "
+        "cached content as it's used and no longer needed in the pipeline.",
+    )
+    delete_cache: bool = Field(
+        default=False,
+        description="If set, will delete the cache work directory when process finishes",
     )
-    preserve_downloads: bool = False
-    download_only: bool = False
-    max_docs: Optional[int] = None
-    re_download: bool = False
-    uncompress: bool = False

     # OTEL support
     otel_endpoint: Optional[str] = Field(
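The rewrite above converts ProcessorConfig's bare attributes into documented pydantic Field definitions and adds the cache-management flags (iter_delete, delete_cache) that the pipeline changes further down rely on. A minimal sketch of how the expanded config might be instantiated; the field names come from the diff above, while the chosen values are only an example:

    from unstructured_ingest.v2.interfaces import ProcessorConfig  # import as used in pipeline.py below

    # Example: favor low memory usage over keeping intermediate artifacts.
    config = ProcessorConfig(
        num_processes=2,           # parallel worker processes
        iter_delete=True,          # drop each step's cached output once the next step has consumed it
        delete_cache=True,         # remove the whole work_dir when the run finishes
        preserve_downloads=False,  # allow downloaded files to be cleaned up as well
    )
    print(config.work_dir)         # defaults to DEFAULT_WORK_DIR under ~/.cache/unstructured/ingest
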
unstructured_ingest/v2/logger.py CHANGED
@@ -101,7 +101,7 @@ class SensitiveFormatter(Formatter):


 def remove_root_handlers(logger: Logger) -> None:
-    # NOTE(robinson)
+    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
     # Removing these when they exist prevents this behavior
     if logger.root.hasHandlers():

unstructured_ingest/v2/otel.py CHANGED
@@ -92,7 +92,7 @@ class OtelHandler:
             return None
         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

-        logger.debug(f"
+        logger.debug(f"adding otel exported at {otel_endpoint}")
         trace_exporter = OTLPSpanExporter()
         processor = SimpleSpanProcessor(trace_exporter)
         provider.add_span_processor(processor)

unstructured_ingest/v2/pipeline/interfaces.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import asyncio
 import logging
 import multiprocessing as mp
+import shutil
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -118,10 +119,10 @@ class PipelineStep(ABC):
         iterable = iterable or []
         if iterable:
             logger.info(
-                f"
+                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
             )
         else:
-            logger.info(f"
+            logger.info(f"calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
             return self.process_async(iterable=iterable)
         if self.context.mp_supported:
@@ -180,6 +181,12 @@ class PipelineStep(ABC):
     def cache_dir(self) -> Path:
         return Path(self.context.work_dir) / self.identifier

+    def delete_cache(self):
+        if self.context.iter_delete and self.cache_dir.exists():
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
+

 @dataclass
 class BatchPipelineStep(PipelineStep, ABC):

unstructured_ingest/v2/pipeline/pipeline.py CHANGED
@@ -2,7 +2,9 @@ from __future__ import annotations

 import logging
 import multiprocessing as mp
+import shutil
 from dataclasses import InitVar, dataclass, field
+from pathlib import Path
 from typing import Any

 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
@@ -115,7 +117,9 @@ class Pipeline:
         )

     def cleanup(self):
-
+        if self.context.delete_cache and Path(self.context.work_dir).exists():
+            logger.info(f"deleting cache directory: {self.context.work_dir}")
+            shutil.rmtree(self.context.work_dir)

     def log_statuses(self):
         if status := self.context.status:
@@ -183,7 +187,7 @@
         return filtered_records

     def _run(self):
-        logger.info(f"
+        logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
@@ -228,26 +232,33 @@
             logger.info("No files to process after filtering uncompressed content, exiting")
             return

-        if not downloaded_data:
+        if not downloaded_data or self.context.download_only:
             return

         # Partition content
         elements = self.partitioner_step(downloaded_data)
+        # Download data non longer needed, delete if possible
+        self.downloader_step.delete_cache()
         elements = self.clean_results(results=elements)
         if not elements:
             logger.info("No files to process after partitioning, exiting")
             return

         # Run element specific modifiers
-
-
+        last_step = self.partitioner_step
+        for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
+            elements = step(elements)
             elements = self.clean_results(results=elements)
+            # Delete data from previous step if possible since no longer needed
+            last_step.delete_cache()
+            last_step = step
             if not elements:
-                logger.info(f"
+                logger.info(f"no files to process after {step.__class__.__name__}, exiting")
                 return

         # Upload the final result
         self.uploader_step(iterable=elements)
+        last_step.delete_cache()

     def __str__(self):
         s = [str(self.indexer_step)]

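The _run changes wire the new flags into the data flow: download_only short-circuits after the download step, and each intermediate cache is deleted once the following step has consumed it. A condensed sketch of that deletion pattern (simplified; not the exact Pipeline._run body):

    # Sketch of the iter_delete pattern above; step objects stand in for the
    # chunker/embedder/stager steps, each exposing __call__ and delete_cache().
    def run_modifier_steps(elements, partitioner_step, optional_steps, uploader_step):
        last_step = partitioner_step
        for step in [s for s in optional_steps if s]:  # skip steps that were not configured
            elements = step(elements)
            last_step.delete_cache()  # previous step's cached output is no longer needed
            last_step = step
            if not elements:
                return
        uploader_step(iterable=elements)
        last_step.delete_cache()  # final intermediate cache once the upload has run
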
unstructured_ingest/v2/pipeline/steps/chunk.py CHANGED
@@ -29,7 +29,7 @@ class ChunkStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class ChunkStep(PipelineStep):

     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing chunker output to: {output_filepath}")
             json.dump(chunked_content, f, indent=2)

     async def _run_async(
@@ -54,7 +54,7 @@ class ChunkStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping chunking, output already exists: {output_filepath}")
             return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):

unstructured_ingest/v2/pipeline/steps/download.py CHANGED
@@ -1,6 +1,7 @@
 import asyncio
 import hashlib
 import json
+import shutil
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar
@@ -82,7 +83,7 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-        logger.debug(f"
+        logger.debug(f"updating file data with new content: {file_data.to_dict()}")
         with file_data_path.open("w") as file:
             json.dump(file_data.to_dict(), file, indent=2)

@@ -90,7 +91,7 @@
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
-            logger.debug(f"
+            logger.debug(f"skipping download, file already exists locally: {download_path}")
             self.update_file_data(
                 file_data=file_data,
                 file_data_path=Path(file_data_path),
@@ -185,3 +186,17 @@
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
+
+    @property
+    def cache_dir(self) -> Path:
+        return self.process.download_config.download_dir
+
+    def delete_cache(self):
+        if (
+            self.context.iter_delete
+            and not self.context.preserve_downloads
+            and self.cache_dir.exists()
+        ):
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)

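Unlike the base PipelineStep.delete_cache above, the DownloadStep override points cache_dir at the connector's download directory and also honors preserve_downloads, so iterative deletion never discards files the user asked to keep. The extra guard reduces to the following sketch (context stands in for the ProcessorConfig shown earlier):

    # Sketch of the extra condition the DownloadStep override applies.
    def should_delete_download_cache(context) -> bool:
        return context.iter_delete and not context.preserve_downloads
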
unstructured_ingest/v2/pipeline/steps/embed.py CHANGED
@@ -29,7 +29,7 @@ class EmbedStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class EmbedStep(PipelineStep):

     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing embedded output to: {output_filepath}")
             json.dump(embedded_content, f, indent=2)

     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
@@ -52,7 +52,7 @@ class EmbedStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping embedding, output already exists: {output_filepath}")
             return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):

unstructured_ingest/v2/pipeline/steps/filter.py CHANGED
@@ -17,7 +17,7 @@ class FilterStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
         file_data = FileData.from_file(path=file_data_path)

unstructured_ingest/v2/pipeline/steps/index.py CHANGED
@@ -28,14 +28,14 @@ class IndexStep(PipelineStep):
             self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
-            f"
+            f"created {self.identifier} with configs: {config}, "
             f"connection configs: {connection_config}"
         )

     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"
+            logger.debug(f"generated file data: {file_data.to_dict()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"

unstructured_ingest/v2/pipeline/steps/partition.py CHANGED
@@ -29,7 +29,7 @@ class PartitionStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json()
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class PartitionStep(PipelineStep):

     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing partitioned output to: {output_filepath}")
             json.dump(partitioned_content, f, indent=2)

     async def _run_async(
@@ -54,7 +54,7 @@ class PartitionStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):

unstructured_ingest/v2/pipeline/steps/stage.py CHANGED
@@ -31,7 +31,7 @@ class UploadStageStep(PipelineStep):
             self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str

unstructured_ingest/v2/pipeline/steps/uncompress.py CHANGED
@@ -23,7 +23,7 @@ class UncompressStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str

unstructured_ingest/v2/processes/connectors/__init__.py CHANGED
@@ -6,6 +6,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )

+from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
+from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
@@ -92,3 +94,4 @@ add_destination_entry(
 )

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
+add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
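These two hunks register the new Airtable source (airtable.py, +235 lines in the file list above) with the connector registry, following the same pattern as the existing entries. A hedged, self-contained sketch of what such a registry amounts to; the real connector_registry module is not shown in this diff, so the helper and values below are illustrative only:

    # Illustrative stand-in for the connector registry, keyed by CONNECTOR_TYPE string.
    source_registry: dict[str, object] = {}

    def add_source_entry(source_type: str, entry: object) -> None:
        # Register a source connector's entry under its connector-type key.
        source_registry[source_type] = entry

    # After the import-time call in __init__.py, the Airtable entry can be looked up
    # by downstream factory/CLI code. "airtable" and the dict value are stand-ins here.
    add_source_entry(source_type="airtable", entry={"source": "airtable_source_entry"})
    assert "airtable" in source_registry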