unstructured-ingest 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (38)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/interfaces/processor.py +6 -1
  3. unstructured_ingest/v2/interfaces/uploader.py +9 -4
  4. unstructured_ingest/v2/otel.py +111 -0
  5. unstructured_ingest/v2/pipeline/interfaces.py +61 -28
  6. unstructured_ingest/v2/pipeline/otel.py +32 -0
  7. unstructured_ingest/v2/pipeline/pipeline.py +11 -7
  8. unstructured_ingest/v2/pipeline/steps/index.py +2 -0
  9. unstructured_ingest/v2/pipeline/steps/upload.py +7 -19
  10. unstructured_ingest/v2/processes/connectors/astradb.py +3 -8
  11. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +4 -9
  12. unstructured_ingest/v2/processes/connectors/chroma.py +3 -8
  13. unstructured_ingest/v2/processes/connectors/couchbase.py +5 -9
  14. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -10
  15. unstructured_ingest/v2/processes/connectors/elasticsearch.py +4 -7
  16. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +3 -3
  17. unstructured_ingest/v2/processes/connectors/fsspec/box.py +3 -3
  18. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +3 -3
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -6
  20. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +3 -3
  21. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +2 -3
  22. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +3 -3
  23. unstructured_ingest/v2/processes/connectors/kdbai.py +7 -8
  24. unstructured_ingest/v2/processes/connectors/local.py +15 -22
  25. unstructured_ingest/v2/processes/connectors/milvus.py +32 -27
  26. unstructured_ingest/v2/processes/connectors/mongodb.py +3 -8
  27. unstructured_ingest/v2/processes/connectors/pinecone.py +6 -24
  28. unstructured_ingest/v2/processes/connectors/singlestore.py +6 -6
  29. unstructured_ingest/v2/processes/connectors/sql.py +5 -7
  30. unstructured_ingest/v2/processes/connectors/weaviate.py +4 -11
  31. unstructured_ingest/v2/processes/partitioner.py +8 -1
  32. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/METADATA +262 -198
  33. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/RECORD +37 -36
  34. unstructured_ingest/v2/example.py +0 -37
  35. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/LICENSE.md +0 -0
  36. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/WHEEL +0 -0
  37. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/entry_points.txt +0 -0
  38. {unstructured_ingest-0.0.6.dist-info → unstructured_ingest-0.0.8.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
- __version__ = "0.0.6" # pragma: no cover
+ __version__ = "0.0.8" # pragma: no cover
@@ -27,9 +27,14 @@ class ProcessorConfig(BaseModel):
      re_download: bool = False
      uncompress: bool = False

+     # OTEL support
+     otel_endpoint: Optional[str] = Field(
+         default=None, description="OTEL endpoint to publish trace data to"
+     )
+
      # Used to keep track of state in pipeline
      status: dict = Field(default_factory=dict)
-     semaphore: Optional[Semaphore] = Field(init=False, default=None)
+     semaphore: Optional[Semaphore] = Field(init=False, default=None, exclude=True)

      def model_post_init(self, __context: Any) -> None:
          if self.max_connections is not None:
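
The new `otel_endpoint` field is the only configuration a caller needs for trace export. A minimal sketch of setting it, assuming the remaining `ProcessorConfig` fields keep their defaults (the endpoint value is hypothetical; per `OtelHandler.get_otel_endpoint` later in this diff, the standard `OTEL_EXPORTER_OTLP_ENDPOINT` / `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` environment variables serve as fallbacks when it is left unset):

from unstructured_ingest.v2.interfaces.processor import ProcessorConfig

# Hypothetical collector address; any OTLP-compatible gRPC endpoint should work.
context = ProcessorConfig(otel_endpoint="http://localhost:4317")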
@@ -1,4 +1,4 @@
- from abc import ABC, abstractmethod
+ from abc import ABC
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, TypeVar
@@ -31,9 +31,14 @@ class Uploader(BaseProcess, BaseConnector, ABC):
      def is_async(self) -> bool:
          return False

-     @abstractmethod
-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         pass
+     def is_batch(self) -> bool:
+         return False
+
+     def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
+         raise NotImplementedError()
+
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         raise NotImplementedError()

      async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
          return self.run(contents=[UploadContent(path=path, file_data=file_data)], **kwargs)
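
For orientation, a rough sketch of a destination against the reworked interface: `run` now receives a single staged file plus its `FileData` instead of a list of `UploadContent`, and only connectors that want the whole batch override `is_batch`/`run_batch`. This class is illustrative only; it is not part of the package and omits the connection/upload config wiring real connectors carry:

import json
from pathlib import Path
from typing import Any

from unstructured_ingest.v2.interfaces import FileData, Uploader


class LoggingUploader(Uploader):
    # Hypothetical destination: handles one staged JSON file per call.
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        with path.open() as f:
            elements = json.load(f)
        print(f"would upload {len(elements)} elements from {path.name}")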
@@ -0,0 +1,111 @@
+ import os
+ from dataclasses import dataclass, field
+ from typing import Callable, ClassVar, Optional, Protocol, Sequence
+
+ from opentelemetry import trace
+ from opentelemetry.context import attach, get_current
+ from opentelemetry.propagate import extract, inject
+ from opentelemetry.sdk.resources import SERVICE_NAME, Resource
+ from opentelemetry.sdk.trace import ReadableSpan, Tracer, TracerProvider
+ from opentelemetry.sdk.trace.export import (
+     ConsoleSpanExporter,
+     SimpleSpanProcessor,
+     SpanExportResult,
+ )
+
+ from unstructured_ingest.v2.logger import logger
+
+
+ class AddTraceCallable(Protocol):
+     def __call__(self, provider: TracerProvider) -> None:
+         pass
+
+
+ class LogSpanExporter(ConsoleSpanExporter):
+     def __init__(self, log_out: Callable = logger.info, **kwargs):
+         self.log_out = log_out
+         super().__init__(**kwargs)
+
+     def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
+         for span in spans:
+             self.log_out(self.formatter(span))
+         return SpanExportResult.SUCCESS
+
+
+ @dataclass
+ class OtelHandler:
+     otel_endpoint: Optional[str] = None
+     service_name: str = "unstructured-ingest"
+     trace_provider: TracerProvider = field(init=False)
+     log_out: Callable = field(default=logger.info)
+     trace_context_key: ClassVar[str] = "_trace_context"
+
+     def init_trace(self):
+         # Should only be done once
+         resource = Resource(attributes={SERVICE_NAME: self.service_name})
+         trace_provider = self.init_trace_provider(resource=resource)
+         trace.set_tracer_provider(trace_provider)
+
+     @staticmethod
+     def set_attributes(span, attributes_dict):
+         if attributes_dict:
+             for att in attributes_dict:
+                 span.set_attribute(att, attributes_dict[att])
+
+     @staticmethod
+     def inject_context() -> dict:
+         trace_context = {}
+         current_context = get_current()
+         inject(trace_context, current_context)
+         return trace_context
+
+     @staticmethod
+     def attach_context(trace_context: dict) -> object:
+         extracted_context = extract(trace_context)
+         return attach(extracted_context)
+
+     def get_otel_endpoint(self) -> Optional[str]:
+         if otel_endpoint := self.otel_endpoint:
+             return otel_endpoint
+         if otlp_endpoint := os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"):
+             return otlp_endpoint
+         if otlp_traces_endpoint := os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
+             return otlp_traces_endpoint
+         return None
+
+     def _add_console_trace_processor(self, provider: TracerProvider) -> None:
+         def custom_formatter(span: ReadableSpan) -> str:
+             duration = (span.end_time - span.start_time) / 1e9
+             s = f"{span.name} finished in {duration}s"
+             if span.attributes:
+                 attributes_str = ", ".join([f"{k}={v}" for k, v in span.attributes.items()])
+                 s += f", attributes: {attributes_str}"
+             return s
+
+         tracer_exporter = LogSpanExporter(formatter=custom_formatter, log_out=self.log_out)
+         processor = SimpleSpanProcessor(tracer_exporter)
+         provider.add_span_processor(span_processor=processor)
+
+     def _add_otel_trace_processor(self, provider: TracerProvider) -> None:
+         otel_endpoint = self.get_otel_endpoint()
+         if not otel_endpoint:
+             return None
+         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+
+         logger.debug(f"Adding otel exported at {otel_endpoint}")
+         trace_exporter = OTLPSpanExporter()
+         processor = SimpleSpanProcessor(trace_exporter)
+         provider.add_span_processor(processor)
+
+     def init_trace_provider(self, resource: Resource) -> TracerProvider:
+         trace_provider = TracerProvider(resource=resource)
+         add_fns: list[AddTraceCallable] = [
+             self._add_otel_trace_processor,
+             self._add_console_trace_processor,
+         ]
+         for add_fn in add_fns:
+             add_fn(provider=trace_provider)
+         return trace_provider
+
+     def get_tracer(self) -> Tracer:
+         return trace.get_tracer(self.service_name)
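
Taken together, `OtelHandler` wraps the usual OpenTelemetry setup: build a `TracerProvider`, attach the log-based exporter plus an optional OTLP exporter, and hand out tracers. A minimal sketch of exercising it directly (the endpoint is an assumed local OTLP gRPC collector; without one configured, spans only reach the log exporter):

from unstructured_ingest.v2.otel import OtelHandler

handler = OtelHandler(otel_endpoint="http://localhost:4317")  # assumed collector address
handler.init_trace()  # register the tracer provider once per process

tracer = handler.get_tracer()
with tracer.start_as_current_span("example work", record_exception=True) as span:
    OtelHandler.set_attributes(span, {"file_id": "example-doc"})
    # ... traced work goes here ...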
@@ -1,40 +1,24 @@
  import asyncio
  import logging
  import multiprocessing as mp
- from abc import ABC
+ from abc import ABC, abstractmethod
  from concurrent.futures import ThreadPoolExecutor
  from dataclasses import dataclass
- from functools import wraps
  from pathlib import Path
- from time import time
  from typing import Any, Awaitable, Callable, Optional, TypeVar

  from tqdm import tqdm
  from tqdm.asyncio import tqdm as tqdm_asyncio

- from unstructured_ingest.v2.interfaces import BaseProcess, ProcessorConfig
+ from unstructured_ingest.v2.interfaces import BaseProcess, ProcessorConfig, Uploader
  from unstructured_ingest.v2.logger import logger, make_default_logger
+ from unstructured_ingest.v2.otel import OtelHandler
+ from unstructured_ingest.v2.pipeline.otel import instrument

  BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
  iterable_input = list[dict[str, Any]]


- def timed(func):
-     @wraps(func)
-     def time_it(self, *args, **kwargs):
-         start = time()
-         try:
-             return func(self, *args, **kwargs)
-         finally:
-             if func.__name__ == "__call__":
-                 reported_name = f"{self.__class__.__name__} [cls]"
-             else:
-                 reported_name = func.__name__
-             logger.info(f"{reported_name} took {time() - start} seconds")
-
-     return time_it
-
-
  @dataclass
  class PipelineStep(ABC):
      process: BaseProcessT
@@ -97,9 +81,15 @@ class PipelineStep(ABC):
              return self.process_serially(iterable)
          with mp.Pool(
              processes=self.context.num_processes,
-             initializer=self._init_logger,
-             initargs=(logging.DEBUG if self.context.verbose else logging.INFO,),
+             initializer=self._init_mp,
+             initargs=(
+                 logging.DEBUG if self.context.verbose else logging.INFO,
+                 self.context.otel_endpoint,
+             ),
          ) as pool:
+             otel_context = OtelHandler.inject_context()
+             for iter in iterable:
+                 iter[OtelHandler.trace_context_key] = otel_context
              if self.context.tqdm:
                  return list(
                      tqdm(
@@ -115,11 +105,13 @@
          # Allow mapping of kwargs via multiprocessing map()
          return self.run(**input_kwargs)

-     def _init_logger(self, log_level: int):
+     def _init_mp(self, log_level: int, endpoint: Optional[str] = None) -> None:
          # Init logger for each spawned process when using multiprocessing pool
          make_default_logger(level=log_level)
+         otel_handler = OtelHandler(otel_endpoint=endpoint, log_out=logger.debug)
+         otel_handler.init_trace()

-     @timed
+     @instrument()
      def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
          iterable = iterable or []
          if iterable:
@@ -141,9 +133,19 @@
          raise NotImplementedError

      def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
+         kwargs = kwargs.copy()
+         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
+         tracer = otel_handler.get_tracer()
+         if trace_context := kwargs.pop(otel_handler.trace_context_key, {}):
+             otel_handler.attach_context(trace_context=trace_context)
+         attributes = {}
+         if file_data_path := kwargs.get("file_data_path"):
+             attributes["file_id"] = Path(file_data_path).stem
          try:
-             fn = _fn or self.process.run
-             return self._run(fn=fn, **kwargs)
+             with tracer.start_as_current_span(self.identifier, record_exception=True) as span:
+                 otel_handler.set_attributes(span, attributes)
+                 fn = _fn or self.process.run
+                 return self._run(fn=fn, **kwargs)
          except Exception as e:
              logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
              if "file_data_path" in kwargs:
@@ -153,9 +155,17 @@
              return None

      async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
+         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
          try:
-             fn = _fn or self.process.run_async
-             return await self._run_async(fn=fn, **kwargs)
+             attributes = {}
+             if file_data_path := kwargs.get("file_data_path"):
+                 attributes["file_id"] = Path(file_data_path).stem
+             with otel_handler.get_tracer().start_as_current_span(
+                 self.identifier, record_exception=True
+             ) as span:
+                 otel_handler.set_attributes(span, attributes)
+                 fn = _fn or self.process.run_async
+                 return await self._run_async(fn=fn, **kwargs)
          except Exception as e:
              logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
              if "file_data_path" in kwargs:
@@ -167,3 +177,26 @@
      @property
      def cache_dir(self) -> Path:
          return Path(self.context.work_dir) / self.identifier
+
+
+ @dataclass
+ class BatchPipelineStep(PipelineStep, ABC):
+     process: Uploader
+
+     def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
+         if self.context.mp_supported and self.process.is_batch():
+             return self.run_batch(contents=iterable)
+         super().__call__(iterable=iterable)
+
+     @abstractmethod
+     def _run_batch(self, contents: iterable_input, **kwargs) -> Any:
+         pass
+
+     def run_batch(self, contents: iterable_input, **kwargs) -> Any:
+         try:
+             return self._run_batch(contents=contents, **kwargs)
+         except Exception as e:
+             self.context.status[self.identifier] = {"step_error": str(e)}
+             if self.context.raise_on_error:
+                 raise e
+             return None
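
An uploader opts into this batch path by returning True from `is_batch()` and implementing `run_batch`; otherwise `BatchPipelineStep` falls back to the per-file behaviour inherited from `PipelineStep`. A hypothetical sketch of such a connector (not one shipped in the package, and again without the config plumbing):

import json
from typing import Any

from unstructured_ingest.v2.interfaces import Uploader
from unstructured_ingest.v2.interfaces.uploader import UploadContent


class BulkLoggingUploader(Uploader):
    # Hypothetical destination that wants every staged file in one call.
    def is_batch(self) -> bool:
        return True

    def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
        total = 0
        for content in contents:
            with content.path.open() as f:
                total += len(json.load(f))
        print(f"would upload {total} elements from {len(contents)} files")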
@@ -0,0 +1,32 @@
+ from functools import wraps
+ from typing import Callable, Optional
+
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.otel import OtelHandler
+
+
+ def instrument(
+     span_name: Optional[str] = None,
+     record_exception: bool = True,
+     attributes: dict[str, str] = None,
+     log_out: Callable = logger.info,
+ ) -> Callable[[Callable], Callable]:
+     def span_decorator(func: Callable) -> Callable:
+         def get_name(self) -> str:
+             if span_name:
+                 return span_name
+             return f"{self.identifier} step"
+
+         @wraps(func)
+         def wrap_with_span(self, *args, **kwargs):
+             name = get_name(self=self)
+             otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=log_out)
+             with otel_handler.get_tracer().start_as_current_span(
+                 name, record_exception=record_exception
+             ) as span:
+                 otel_handler.set_attributes(span, attributes)
+                 return func(self, *args, **kwargs)
+
+         return wrap_with_span
+
+     return span_decorator
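
The decorator assumes it wraps a method on a step-like object: it reads `self.context.otel_endpoint` for the handler and falls back to `f"{self.identifier} step"` as the span name when `span_name` is omitted, which is how the index and upload steps below use it via `@instrument(span_name=STEP_ID)`. A hedged sketch of applying it outside the built-in steps (the class and attribute values are made up):

from unstructured_ingest.v2.interfaces.processor import ProcessorConfig
from unstructured_ingest.v2.pipeline.otel import instrument


class ExampleStep:
    identifier = "example"  # becomes the span name ("example step") when span_name is omitted

    def __init__(self, context: ProcessorConfig):
        self.context = context  # instrument() reads context.otel_endpoint from here

    @instrument(attributes={"stage": "demo"})
    def __call__(self) -> None:
        ...  # runs inside the span

Each call on an ExampleStep instance then produces one span, exported according to how the tracer provider was initialized.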
@@ -1,11 +1,11 @@
  import logging
  import multiprocessing as mp
  from dataclasses import InitVar, dataclass, field
- from time import time
  from typing import Any, Optional, Union

- from unstructured_ingest.v2.interfaces import ProcessorConfig
+ from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
  from unstructured_ingest.v2.logger import logger, make_default_logger
+ from unstructured_ingest.v2.otel import OtelHandler
  from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
  from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
  from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
@@ -14,7 +14,7 @@ from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
  from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
  from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
  from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
- from unstructured_ingest.v2.pipeline.steps.upload import Uploader, UploadStep
+ from unstructured_ingest.v2.pipeline.steps.upload import UploadStep
  from unstructured_ingest.v2.processes.chunker import ChunkerConfig
  from unstructured_ingest.v2.processes.connector_registry import (
      ConnectionConfig,
@@ -77,6 +77,8 @@ class Pipeline:
          filterer: Filterer = None,
      ):
          make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
+         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
+         otel_handler.init_trace()
          self.indexer_step = IndexStep(process=indexer, context=self.context)
          self.downloader_step = DownloadStep(process=downloader, context=self.context)
          self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
@@ -121,11 +123,13 @@ class Pipeline:
                  logger.error(f"{k}: [{kk}] {vv}")

      def run(self):
+         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
          try:
-             start_time = time()
-             self._run_prechecks()
-             self._run()
-             logger.info(f"Finished ingest process in {time() - start_time}s")
+             with otel_handler.get_tracer().start_as_current_span(
+                 "ingest process", record_exception=True
+             ):
+                 self._run_prechecks()
+                 self._run()
          finally:
              self.log_statuses()
              self.cleanup()
@@ -6,6 +6,7 @@ from typing import Generator, Optional, TypeVar
  from unstructured_ingest.v2.interfaces.indexer import Indexer
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+ from unstructured_ingest.v2.pipeline.otel import instrument
  from unstructured_ingest.v2.utils import serialize_base_model_json

  IndexerT = TypeVar("IndexerT", bound=Indexer)
@@ -31,6 +32,7 @@ class IndexStep(PipelineStep):
              f"connection configs: {connection_config}"
          )

+     @instrument(span_name=STEP_ID)
      def run(self) -> Generator[str, None, None]:
          for file_data in self.process.run():
              logger.debug(f"Generated file data: {file_data.to_dict()}")
@@ -4,9 +4,10 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
- from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
+ from unstructured_ingest.v2.interfaces.uploader import UploadContent
  from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
+ from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
+ from unstructured_ingest.v2.pipeline.otel import instrument

  STEP_ID = "upload"

@@ -17,8 +18,7 @@ class UploadStepContent(TypedDict):


  @dataclass
- class UploadStep(PipelineStep):
-     process: Uploader
+ class UploadStep(BatchPipelineStep):
      identifier: str = STEP_ID

      def __str__(self):
@@ -34,25 +34,13 @@ class UploadStep(PipelineStep):
              f"connection configs: {connection_config}"
          )

-     def process_whole(self, iterable: iterable_input):
-         self.run(contents=iterable)
-
-     @timed
-     def __call__(self, iterable: iterable_input):
-         logger.info(
-             f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
-         )
-         if self.process.is_async():
-             self.process_async(iterable=iterable)
-         else:
-             self.process_whole(iterable=iterable)
-
-     def _run(self, fn: Callable, contents: list[UploadStepContent]):
+     @instrument(span_name=STEP_ID)
+     def _run_batch(self, contents: list[UploadStepContent]) -> None:
          upload_contents = [
              UploadContent(path=Path(c["path"]), file_data=FileData.from_file(c["file_data_path"]))
              for c in contents
          ]
-         fn(contents=upload_contents)
+         self.process.run_batch(contents=upload_contents)

      async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
          fn = fn or self.process.run_async
@@ -14,7 +14,6 @@ from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
      FileData,
-     UploadContent,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -139,13 +138,9 @@ class AstraDBUploader(Uploader):
          )
          return astra_db_collection

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         elements_dict = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements = json.load(elements_file)
-             elements_dict.extend(elements)
-
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         with path.open("r") as file:
+             elements_dict = json.load(file)
          logger.info(
              f"writing {len(elements_dict)} objects to destination "
              f"collection {self.upload_config.collection_name}"
@@ -12,7 +12,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
-     UploadContent,
+     FileData,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -192,14 +192,9 @@ class AzureCognitiveSearchUploader(Uploader):
      def write_dict_wrapper(self, elements_dict):
          return self.write_dict(elements_dict=elements_dict)

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-
-         elements_dict = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements = json.load(elements_file)
-             elements_dict.extend(elements)
-
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         with path.open("r") as file:
+             elements_dict = json.load(file)
          logger.info(
              f"writing document batches to destination"
              f" endpoint at {str(self.connection_config.endpoint)}"
@@ -15,7 +15,6 @@ from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
      FileData,
-     UploadContent,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -186,13 +185,9 @@ class ChromaUploader(Uploader):
          )
          return chroma_dict

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-
-         elements_dict = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements = json.load(elements_file)
-             elements_dict.extend(elements)
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         with path.open("r") as file:
+             elements_dict = json.load(file)

          logger.info(
              f"writing {len(elements_dict)} objects to destination "
@@ -26,7 +26,6 @@ from unstructured_ingest.v2.interfaces import (
      FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
-     UploadContent,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -134,14 +133,11 @@ class CouchbaseUploader(Uploader):
              logger.error(f"Failed to validate connection {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         elements = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements.extend(json.load(elements_file))
-
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         with path.open("r") as file:
+             elements_dict = json.load(file)
          logger.info(
-             f"writing {len(elements)} objects to destination "
+             f"writing {len(elements_dict)} objects to destination "
              f"bucket, {self.connection_config.bucket} "
              f"at {self.connection_config.connection_string}",
          )
@@ -150,7 +146,7 @@
          scope = bucket.scope(self.connection_config.scope)
          collection = scope.collection(self.connection_config.collection)

-         for chunk in batch_generator(elements, self.upload_config.batch_size):
+         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
              collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})

@@ -1,5 +1,6 @@
  import os
  from dataclasses import dataclass
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Optional

  from pydantic import Field, Secret
@@ -9,7 +10,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
-     UploadContent,
+     FileData,
      Uploader,
      UploaderConfig,
  )
@@ -142,15 +143,13 @@ class DatabricksVolumesUploader(Uploader):
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         for content in contents:
-             with open(content.path, "rb") as elements_file:
-                 output_path = os.path.join(self.upload_config.path, content.path.name)
-                 self.get_client().files.upload(
-                     file_path=output_path,
-                     contents=elements_file,
-                     overwrite=self.upload_config.overwrite,
-                 )
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         output_path = os.path.join(self.upload_config.path, path.name)
+         self.get_client().files.upload(
+             file_path=output_path,
+             contents=path,
+             overwrite=self.upload_config.overwrite,
+         )


  databricks_volumes_destination_entry = DestinationRegistryEntry(
@@ -26,7 +26,6 @@ from unstructured_ingest.v2.interfaces import (
      FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
-     UploadContent,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -384,14 +383,12 @@ class ElasticsearchUploader(Uploader):

          return parallel_bulk

-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
          parallel_bulk = self.load_parallel_bulk()
-         elements_dict = []
-         for content in contents:
-             with open(content.path) as elements_file:
-                 elements = json.load(elements_file)
-             elements_dict.extend(elements)
+         with path.open("r") as file:
+             elements_dict = json.load(file)
          upload_destination = self.connection_config.hosts or self.connection_config.cloud_id
+
          logger.info(
              f"writing {len(elements_dict)} elements via document batches to destination "
              f"index named {self.upload_config.index_name} at {upload_destination} with "
@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
  from pydantic import Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
      SourceRegistryEntry,
@@ -152,8 +152,8 @@ class AzureUploader(FsspecUploader):
          super().precheck()

      @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-         return super().run(contents=contents, **kwargs)
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         return super().run(path=path, file_data=file_data, **kwargs)

      @requires_dependencies(["adlfs", "fsspec"], extras="azure")
      async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: