unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +102 -91
- test/integration/connectors/sql/test_singlestore.py +111 -99
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +86 -75
- test/integration/connectors/test_astradb.py +22 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +4 -4
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +3 -3
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
- unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +18 -14
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
test/integration/connectors/utils/{validation.py → validation/source.py}
@@ -1,83 +1,28 @@
-import filecmp
 import json
 import os
 import shutil
-from dataclasses import
+from dataclasses import replace
 from pathlib import Path
 from typing import Callable, Optional
 
-import pandas as pd
-from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
+from pydantic import Field
 
-from test.integration.connectors.utils.
+from test.integration.connectors.utils.validation.utils import ValidationConfig, reset_dir
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
-
-    expected_df = pd.read_csv(expected_filepath)
-    current_df = pd.read_csv(current_filepath)
-    if expected_df.equals(current_df):
-        return True
-    # Print diff
-    diff = expected_df.merge(current_df, indicator=True, how="left").loc[
-        lambda x: x["_merge"] != "both"
-    ]
-    print("diff between expected and current df:")
-    print(diff)
-    return False
-
-
-def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_soup = BeautifulSoup(expected_f, "html.parser")
-    with current_filepath.open() as current_f:
-        current_soup = BeautifulSoup(current_f, "html.parser")
-    return expected_soup.text == current_soup.text
-
-
-def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_text_lines = expected_f.readlines()
-    with current_filepath.open() as current_f:
-        current_text_lines = current_f.readlines()
-    if len(expected_text_lines) != len(current_text_lines):
-        print(
-            f"Lines in expected text file ({len(expected_text_lines)}) "
-            f"don't match current text file ({len(current_text_lines)})"
-        )
-        return False
-    expected_text = "\n".join(expected_text_lines)
-    current_text = "\n".join(current_text_lines)
-    if expected_text == current_text:
-        return True
-    print("txt content don't match:")
-    print(f"expected: {expected_text}")
-    print(f"current: {current_text}")
-    return False
-
-
-file_type_equality_check = {
-    ".json": json_equality_check,
-    ".html": html_equality_check,
-    ".txt": txt_equality_check,
-}
-
-
-@dataclass
-class ValidationConfigs:
-    test_id: str
+class SourceValidationConfigs(ValidationConfig):
     expected_number_indexed_file_data: Optional[int] = None
     expected_num_files: Optional[int] = None
     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
-    exclude_fields: list[str] =
+    exclude_fields: list[str] = Field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
    )
-    exclude_fields_extend: list[str] =
+    exclude_fields_extend: list[str] = Field(default_factory=list)
     validate_downloaded_files: bool = False
     validate_file_data: bool = True
-    downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
@@ -97,9 +42,6 @@ class ValidationConfigs:
         downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
         assert len(downloaded_files) == expected_num_files
 
-    def test_output_dir(self) -> Path:
-        return expected_results_path / self.test_id
-
     def omit_ignored_fields(self, data: dict) -> dict:
         exclude_fields = self.get_exclude_fields()
         # Ignore fields that dynamically change every time the tests run
@@ -143,7 +85,7 @@ def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
 
 
 def check_contents(
-    expected_output_dir: Path, all_file_data: list[FileData], configs:
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     found_diff = False
     for file_data in all_file_data:
@@ -160,27 +102,10 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
-def detect_diff(
-    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
-) -> bool:
-    if expected_filepath.suffix != current_filepath.suffix:
-        return True
-    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-        return not downloaded_file_equality_check(expected_filepath, current_filepath)
-    current_suffix = expected_filepath.suffix
-    if current_suffix in file_type_equality_check:
-        equality_check_callable = file_type_equality_check[current_suffix]
-        return not equality_check_callable(
-            expected_filepath=expected_filepath, current_filepath=current_filepath
-        )
-    # Fallback is using filecmp.cmp to compare the files
-    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
-
-
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
-    configs:
+    configs: SourceValidationConfigs,
 ):
     current_files = get_files(dir_path=current_output_dir)
     found_diff = False
@@ -188,7 +113,7 @@ def check_raw_file_contents(
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if detect_diff(
+        if configs.detect_diff(expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -196,7 +121,7 @@ def check_raw_file_contents(
 
 
 def run_expected_results_validation(
-    expected_output_dir: Path, all_file_data: list[FileData], configs:
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
     check_contents(
@@ -207,7 +132,7 @@ def run_expected_results_validation(
 def run_expected_download_files_validation(
     expected_output_dir: Path,
     current_download_dir: Path,
-    configs:
+    configs: SourceValidationConfigs,
 ):
     check_files_in_paths(
         expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
@@ -234,12 +159,10 @@ def update_fixtures(
     save_downloads: bool = False,
     save_filedata: bool = True,
 ):
-    # Delete current files
-    shutil.rmtree(path=output_dir, ignore_errors=True)
-    output_dir.mkdir(parents=True)
     # Rewrite the current file data
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
+        reset_dir(dir_path=file_data_output_path)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -260,6 +183,7 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
+        reset_dir(raw_download_output_path)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -268,7 +192,7 @@ def update_fixtures(
 
 
 def run_all_validations(
-    configs:
+    configs: SourceValidationConfigs,
     predownload_file_data: list[FileData],
     postdownload_file_data: list[FileData],
     download_dir: Path,
@@ -308,7 +232,7 @@ def run_all_validations(
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
-    configs:
+    configs: SourceValidationConfigs,
     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
 ) -> None:
     # Run common validations on the process of running a source connector, supporting dynamic
test/integration/connectors/utils/validation/utils.py (new file)
@@ -0,0 +1,36 @@
+import filecmp
+import shutil
+from pathlib import Path
+from typing import Callable, Optional
+
+from pydantic import BaseModel
+
+from test.integration.connectors.utils.constants import expected_results_path
+from test.integration.connectors.utils.validation.equality import file_type_equality_check
+
+
+class ValidationConfig(BaseModel):
+    test_id: str
+    file_equality_check: Optional[Callable[[Path, Path], bool]] = None
+
+    def test_output_dir(self) -> Path:
+        return expected_results_path / self.test_id
+
+    def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
+        if expected_filepath.suffix != current_filepath.suffix:
+            return True
+        if file_equality_check := self.file_equality_check:
+            return not file_equality_check(expected_filepath, current_filepath)
+        current_suffix = expected_filepath.suffix
+        if current_suffix in file_type_equality_check:
+            equality_check_callable = file_type_equality_check[current_suffix]
+            return not equality_check_callable(
+                expected_filepath=expected_filepath, current_filepath=current_filepath
+            )
+        # Fallback is using filecmp.cmp to compare the files
+        return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
+def reset_dir(dir_path: Path) -> None:
+    shutil.rmtree(path=dir_path, ignore_errors=True)
+    dir_path.mkdir(parents=True)
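The shared ValidationConfig base now owns fixture-diff detection: detect_diff first tries an explicitly supplied file_equality_check, then the per-extension checks from validation.equality, and finally falls back to filecmp. A minimal sketch of plugging in a custom comparator; the size-based check and the paths below are illustrative only, not part of the test suite:

# Hypothetical use of ValidationConfig with a custom file_equality_check.
from pathlib import Path

from test.integration.connectors.utils.validation.utils import ValidationConfig


def same_size(expected: Path, current: Path) -> bool:
    # Example heuristic only: treat files as equal when their byte sizes match
    return expected.stat().st_size == current.stat().st_size


config = ValidationConfig(test_id="example-connector", file_equality_check=same_size)
# True means a diff was detected between the fixture and the freshly produced file
has_diff = config.detect_diff(Path("expected/output.bin"), Path("current/output.bin"))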
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.
+__version__ = "0.3.9" # pragma: no cover
unstructured_ingest/utils/chunking.py
@@ -1,4 +1,7 @@
+import base64
 import hashlib
+import json
+import zlib
 from itertools import groupby
 
 
@@ -43,3 +46,11 @@ def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
             e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]
 
     return elements
+
+
+def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
+    decoded_b64_bytes = base64.b64decode(raw_s)
+    elements_json_bytes = zlib.decompress(decoded_b64_bytes)
+    elements_json_str = elements_json_bytes.decode("utf-8")
+    element_dicts = json.loads(elements_json_str)
+    return element_dicts
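The new elements_from_base64_gzipped_json helper base64-decodes a string, decompresses it with zlib, and parses the JSON payload back into element dicts. A minimal round-trip sketch; the sample elements are made up:

# Round-trip sketch for elements_from_base64_gzipped_json (sample data is illustrative).
import base64
import json
import zlib

from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json

elements = [{"type": "Title", "text": "Hello"}, {"type": "NarrativeText", "text": "World"}]
encoded = base64.b64encode(zlib.compress(json.dumps(elements).encode("utf-8"))).decode("utf-8")
assert elements_from_base64_gzipped_json(encoded) == elements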
unstructured_ingest/utils/data_prep.py
@@ -1,8 +1,10 @@
 import itertools
 import json
 from datetime import datetime
+from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
 
+import ndjson
 import pandas as pd
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -131,3 +133,37 @@ def validate_date_args(date: Optional[str] = None) -> bool:
         f"The argument {date} does not satisfy the format:"
         f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
     )
+
+
+def get_data(path: Path) -> list[dict]:
+    with path.open() as f:
+        if path.suffix == ".json":
+            return json.load(f)
+        elif path.suffix == ".ndjson":
+            return ndjson.load(f)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df.to_dict(orient="records")
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df.to_dict(orient="records")
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
+
+
+def get_data_df(path: Path) -> pd.DataFrame:
+    with path.open() as f:
+        if path.suffix == ".json":
+            data = json.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".ndjson":
+            data = ndjson.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
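The new get_data and get_data_df helpers choose a parser from the file extension (.json, .ndjson, .csv, .parquet) and raise on anything else. A short usage sketch; the file paths are placeholders:

# Usage sketch for the new data_prep loaders; the paths are hypothetical.
from pathlib import Path

from unstructured_ingest.utils.data_prep import get_data, get_data_df

records = get_data(path=Path("staged/elements.ndjson"))  # parsed into list[dict]
frame = get_data_df(path=Path("staged/elements.csv"))    # parsed into a pandas DataFrame
print(len(records), frame.shape)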
unstructured_ingest/v2/interfaces/upload_stager.py
@@ -1,8 +1,10 @@
-
+import json
+from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, TypeVar
 
+import ndjson
 from pydantic import BaseModel
 
 from unstructured_ingest.v2.interfaces.file_data import FileData
@@ -20,16 +22,78 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT
 
-
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
+        if output_path.suffix == ".json":
+            with output_path.open("w") as f:
+                json.dump(data, f, indent=2)
+        elif output_path.suffix == ".ndjson":
+            with output_path.open("w") as f:
+                ndjson.dump(data, f)
+        else:
+            raise ValueError(f"Unsupported output format: {output_path}")
+
+    def get_data(self, elements_filepath: Path) -> list[dict]:
+        if elements_filepath.suffix == ".json":
+            with elements_filepath.open() as f:
+                return json.load(f)
+        elif elements_filepath.suffix == ".ndjson":
+            with elements_filepath.open() as f:
+                return ndjson.load(f)
+        else:
+            raise ValueError(f"Unsupported input format: {elements_filepath}")
+
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        return element_dict
+
+    def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
+        output_path = Path(output_filename)
+        output_filename = f"{Path(output_filename).stem}{output_path.suffix}"
+        output_path = Path(output_dir) / Path(f"{output_filename}")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        return output_path
+
+    def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+        with input_file.open() as in_f:
+            reader = ndjson.reader(in_f)
+            with output_file.open("w") as out_f:
+                writer = ndjson.writer(out_f)
+                for element in reader:
+                    conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
+                    writer.writerow(row=conformed_element)
+                    writer.f.flush()
+
+    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+        with input_file.open() as in_f:
+            elements_contents = json.load(in_f)
+
+        conformed_elements = [
+            self.conform_dict(element_dict=element, file_data=file_data)
+            for element in elements_contents
+        ]
+
+        with open(output_file, "w") as out_f:
+            json.dump(conformed_elements, out_f, indent=2)
+
     def run(
         self,
         elements_filepath: Path,
         file_data: FileData,
         output_dir: Path,
         output_filename: str,
-        **kwargs: Any
+        **kwargs: Any,
     ) -> Path:
-
+        output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
+        if elements_filepath.suffix == ".ndjson":
+            self.stream_update(
+                input_file=elements_filepath, output_file=output_file, file_data=file_data
+            )
+        elif elements_filepath.suffix == ".json":
+            self.process_whole(
+                input_file=elements_filepath, output_file=output_file, file_data=file_data
+            )
+        else:
+            raise ValueError(f"Unsupported file extension: {elements_filepath}")
+        return output_file
 
     async def run_async(
         self,
@@ -37,12 +101,12 @@ class UploadStager(BaseProcess, ABC):
         file_data: FileData,
         output_dir: Path,
         output_filename: str,
-        **kwargs: Any
+        **kwargs: Any,
     ) -> Path:
         return self.run(
             elements_filepath=elements_filepath,
             output_dir=output_dir,
             output_filename=output_filename,
             file_data=file_data,
-            **kwargs
+            **kwargs,
         )
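With the staging plumbing moved into the UploadStager base class, a concrete stager only has to override conform_dict; the inherited run() dispatches on the input suffix, streaming .ndjson files element by element and rewriting .json files in one pass, and keeps the input extension on the staged output path. A minimal sketch of a subclass; AddRecordIdStager and its behavior are illustrative, not part of the package:

# Hypothetical stager relying on the new UploadStager base-class behavior.
from dataclasses import dataclass, field

from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig


@dataclass
class AddRecordIdStager(UploadStager):
    upload_stager_config: UploadStagerConfig = field(default_factory=UploadStagerConfig)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        # Tag each element with the identifier of the file it came from
        conformed = element_dict.copy()
        conformed["record_id"] = file_data.identifier
        return conformed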
unstructured_ingest/v2/interfaces/uploader.py
@@ -5,6 +5,7 @@ from typing import Any, TypeVar
 
 from pydantic import BaseModel
 
+from unstructured_ingest.utils.data_prep import get_data
 from unstructured_ingest.v2.interfaces.connector import BaseConnector
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
@@ -38,7 +39,15 @@ class Uploader(BaseProcess, BaseConnector, ABC):
         raise NotImplementedError()
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
+        data = get_data(path=path)
+        self.run_data(data=data, file_data=file_data, **kwargs)
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
+        data = get_data(path=path)
+        await self.run_data_async(data=data, file_data=file_data, **kwargs)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        raise NotImplementedError()
+
+    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        return self.run_data(data=data, file_data=file_data, **kwargs)
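Destination connectors can now implement run_data against already-parsed element dicts; the base run() and run_async() read the staged file with get_data before delegating. A minimal sketch; PrintUploader is illustrative only and omits the connection and upload config wiring a real connector needs:

# Hypothetical uploader built on the new run_data hook.
from dataclasses import dataclass
from typing import Any

from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.uploader import Uploader


@dataclass
class PrintUploader(Uploader):
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        # data has already been parsed from the staged .json/.ndjson/.csv/.parquet file
        print(f"would upload {len(data)} elements for record {file_data.identifier}")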
unstructured_ingest/v2/pipeline/steps/stage.py
@@ -39,11 +39,13 @@ class UploadStageStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str
     ) -> UploadStageStepResponse:
         path = Path(path)
+        # Maintain extension
+        output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
         fn_kwargs = {
             "elements_filepath": path,
             "file_data": FileData.from_file(path=file_data_path),
             "output_dir": self.cache_dir,
-            "output_filename":
+            "output_filename": output_filename,
         }
         if not asyncio.iscoroutinefunction(fn):
             staged_output_path = fn(**fn_kwargs)
unstructured_ingest/v2/processes/connectors/astradb.py
@@ -1,7 +1,6 @@
 import copy
 import csv
 import hashlib
-import json
 import sys
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -17,7 +16,7 @@ from unstructured_ingest.error import (
     SourceConnectionError,
     SourceConnectionNetworkError,
 )
-from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.data_prep import batch_generator, get_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
@@ -325,29 +324,6 @@ class AstraDBUploadStager(UploadStager):
             "metadata": element_dict,
         }
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = []
-        for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
-        output_filename_path = Path(output_filename)
-        if output_filename_path.suffix == ".json":
-            output_path = Path(output_dir) / output_filename_path
-        else:
-            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file, indent=2)
-        return output_path
-
 
 @dataclass
 class AstraDBUploader(Uploader):
@@ -386,11 +362,9 @@ class AstraDBUploader(Uploader):
             f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )
 
-    def
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(
+            f"writing {len(data)} objects to destination "
             f"collection {self.upload_config.collection_name}"
         )
 
@@ -399,9 +373,13 @@ class AstraDBUploader(Uploader):
 
         self.delete_by_record_id(collection=collection, file_data=file_data)
 
-        for chunk in batch_generator(
+        for chunk in batch_generator(data, astra_db_batch_size):
             collection.insert_many(chunk)
 
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        data = get_data(path=path)
+        self.run_data(data=data, file_data=file_data, **kwargs)
+
 
 astra_db_source_entry = SourceRegistryEntry(
     indexer=AstraDBIndexer,
unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -1,7 +1,7 @@
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generator
 
 from pydantic import Field, Secret
 
@@ -49,29 +49,33 @@ class AzureAISearchConnectionConfig(ConnectionConfig):
     access_config: Secret[AzureAISearchAccessConfig]
 
     @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
-
+    @contextmanager
+    def get_search_client(self) -> Generator["SearchClient", None, None]:
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient
 
-
+        with SearchClient(
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
                 self.access_config.get_secret_value().azure_ai_search_key
             ),
-        )
+        ) as client:
+            yield client
 
     @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
-
+    @contextmanager
+    def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents.indexes import SearchIndexClient
 
-
+        with SearchIndexClient(
             endpoint=self.endpoint,
             credential=AzureKeyCredential(
                 self.access_config.get_secret_value().azure_ai_search_key
             ),
-        )
+        ) as search_index_client:
+            yield search_index_client
 
 
 class AzureAISearchUploadStagerConfig(UploadStagerConfig):
@@ -92,14 +96,13 @@ class AzureAISearchUploadStager(UploadStager):
         default_factory=lambda: AzureAISearchUploadStagerConfig()
     )
 
-
-    def conform_dict(data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """
         updates the dictionary that is from each Element being converted into a dict/json
         into a dictionary that conforms to the schema expected by the
         Azure Cognitive Search index
         """
-
+        data = element_dict.copy()
         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
         data[RECORD_ID_LABEL] = file_data.identifier
 
@@ -140,31 +143,6 @@ class AzureAISearchUploadStager(UploadStager):
             data["metadata"]["page_number"] = str(page_number)
         return data
 
-    def run(
-        self,
-        file_data: FileData,
-        elements_filepath: Path,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-
-        if Path(output_filename).suffix != ".json":
-            output_filename = f"{output_filename}.json"
-        else:
-            output_filename = f"{Path(output_filename).stem}.json"
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file, indent=2)
-        return output_path
-
 
 @dataclass
 class AzureAISearchUploader(Uploader):
@@ -270,9 +248,7 @@ class AzureAISearchUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
             f"writing document batches to destination"
             f" endpoint at {str(self.connection_config.endpoint)}"
@@ -287,7 +263,7 @@ class AzureAISearchUploader(Uploader):
 
         batch_size = self.upload_config.batch_size
         with self.connection_config.get_search_client() as search_client:
-            for chunk in batch_generator(
+            for chunk in batch_generator(data, batch_size):
                 self.write_dict(elements_dict=chunk, search_client=search_client) # noqa: E203
 
 
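The Azure AI Search client accessors are now generator-based context managers, so callers acquire and release the underlying SDK clients with a with-block, as the uploader hunk above already does. A usage sketch; the endpoint, index, and key values are placeholders and the constructor field names are inferred from the config shown in this diff:

# Sketch of the context-manager client access; connection values are placeholders.
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
)

connection_config = AzureAISearchConnectionConfig(
    endpoint="https://example.search.windows.net",
    index="example-index",
    access_config=AzureAISearchAccessConfig(azure_ai_search_key="<key>"),
)

with connection_config.get_search_client() as search_client:
    # The SearchClient is closed automatically when the block exits
    print(search_client.get_document_count())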