unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0

test/integration/connectors/utils/validation/destination.py
@@ -0,0 +1,88 @@
+import json
+import os
+import shutil
+from pathlib import Path
+
+import ndjson
+
+from test.integration.connectors.utils.validation.utils import ValidationConfig
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, UploadStager
+
+
+class StagerValidationConfigs(ValidationConfig):
+    expected_count: int
+
+    def stager_output_dir(self) -> Path:
+        dir = self.test_output_dir() / "stager"
+        dir.mkdir(exist_ok=True, parents=True)
+        return dir
+
+    def stager_output_path(self, input_path: Path) -> Path:
+        return self.stager_output_dir() / input_path.name
+
+
+def run_all_stager_validations(
+    configs: StagerValidationConfigs, input_file: Path, staged_filepath: Path
+):
+    # Validate matching extensions
+    assert input_file.suffix == staged_filepath.suffix
+
+    # Validate length
+    staged_data = get_data(staged_filepath=staged_filepath)
+    assert len(staged_data) == configs.expected_count
+
+    # Validate file
+    expected_filepath = configs.stager_output_path(input_path=input_file)
+    assert expected_filepath.exists(), f"{expected_filepath} does not exist"
+    assert expected_filepath.is_file(), f"{expected_filepath} is not a file"
+    if configs.detect_diff(expected_filepath=expected_filepath, current_filepath=staged_filepath):
+        raise AssertionError(
+            f"Current file ({staged_filepath}) does not match expected file: {expected_filepath}"
+        )
+
+
+def update_stager_fixtures(stager_output_path: Path, staged_filepath: Path):
+    copied_filepath = stager_output_path / staged_filepath.name
+    shutil.copy(staged_filepath, copied_filepath)
+
+
+def get_data(staged_filepath: Path) -> list[dict]:
+    if staged_filepath.suffix == ".json":
+        with staged_filepath.open() as f:
+            return json.load(f)
+    elif staged_filepath.suffix == ".ndjson":
+        with staged_filepath.open() as f:
+            return ndjson.load(f)
+    else:
+        raise ValueError(f"Unsupported file type: {staged_filepath.suffix}")
+
+
+def stager_validation(
+    stager: UploadStager,
+    tmp_dir: Path,
+    input_file: Path,
+    configs: StagerValidationConfigs,
+    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
+) -> None:
+    # Run stager
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=input_file.name, filename=input_file.name),
+        connector_type=configs.test_id,
+        identifier="mock file data",
+    )
+    staged_filepath = stager.run(
+        elements_filepath=input_file,
+        file_data=file_data,
+        output_dir=tmp_dir,
+        output_filename=input_file.name,
+    )
+    if not overwrite_fixtures:
+        print("Running validation")
+        run_all_stager_validations(
+            configs=configs, input_file=input_file, staged_filepath=staged_filepath
+        )
+    else:
+        print("Running fixtures update")
+        update_stager_fixtures(
+            stager_output_path=configs.stager_output_dir(), staged_filepath=staged_filepath
+        )
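
The hunk above is the new stager-validation helper listed in the files-changed table as test/integration/connectors/utils/validation/destination.py. A hypothetical pytest-style caller might look like the sketch below; the stager fixture, input file, test id, and expected count are all made up for illustration:

    from pathlib import Path

    from test.integration.connectors.utils.validation.destination import (
        StagerValidationConfigs,
        stager_validation,
    )


    def test_example_stager(tmp_path: Path, example_stager):
        # example_stager: a hypothetical UploadStager fixture for the connector under test
        stager_validation(
            stager=example_stager,
            tmp_dir=tmp_path,
            input_file=Path("example-docs/fake-elements.ndjson"),  # hypothetical input fixture
            configs=StagerValidationConfigs(test_id="example_connector", expected_count=22),
        )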

test/integration/connectors/utils/validation/equality.py
@@ -0,0 +1,75 @@
+import json
+from pathlib import Path
+
+import ndjson
+from bs4 import BeautifulSoup
+from deepdiff import DeepDiff
+
+
+def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as f:
+        expected_data = json.load(f)
+    with current_filepath.open() as f:
+        current_data = json.load(f)
+    diff = DeepDiff(expected_data, current_data)
+    if diff:
+        print("diff between expected and current json")
+        print(diff.to_json(indent=2))
+        return False
+    return True
+
+
+def ndjson_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as f:
+        expected_data = ndjson.load(f)
+    with current_filepath.open() as f:
+        current_data = ndjson.load(f)
+    if len(current_data) != len(expected_data):
+        print(
+            f"expected data length {len(expected_data)} "
+            f"didn't match current results: {len(current_data)}"
+        )
+    for i in range(len(expected_data)):
+        e = expected_data[i]
+        r = current_data[i]
+        if e != r:
+            print(f"{i}th element doesn't match:\nexpected {e}\ncurrent {r}")
+            return False
+    return True
+
+
+def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_soup = BeautifulSoup(expected_f, "html.parser")
+    with current_filepath.open() as current_f:
+        current_soup = BeautifulSoup(current_f, "html.parser")
+    return expected_soup.text == current_soup.text
+
+
+def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_text_lines = expected_f.readlines()
+    with current_filepath.open() as current_f:
+        current_text_lines = current_f.readlines()
+    if len(expected_text_lines) != len(current_text_lines):
+        print(
+            f"Lines in expected text file ({len(expected_text_lines)}) "
+            f"don't match current text file ({len(current_text_lines)})"
+        )
+        return False
+    expected_text = "\n".join(expected_text_lines)
+    current_text = "\n".join(current_text_lines)
+    if expected_text == current_text:
+        return True
+    print("txt content don't match:")
+    print(f"expected: {expected_text}")
+    print(f"current: {current_text}")
+    return False
+
+
+file_type_equality_check = {
+    ".json": json_equality_check,
+    ".ndjson": ndjson_equality_check,
+    ".html": html_equality_check,
+    ".txt": txt_equality_check,
+}
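
These checks are keyed by file suffix in file_type_equality_check, so validation code can dispatch on format without special-casing. A minimal sketch of that dispatch (the fixture and output paths are made up):

    from pathlib import Path

    from test.integration.connectors.utils.validation.equality import file_type_equality_check

    expected = Path("fixtures/expected-output.ndjson")   # hypothetical stored fixture
    current = Path("test-output/current-output.ndjson")  # hypothetical test output
    check = file_type_equality_check[expected.suffix]
    assert check(expected_filepath=expected, current_filepath=current), "outputs diverged"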

test/integration/connectors/utils/{validation.py → validation/source.py}
@@ -1,83 +1,27 @@
-import filecmp
 import json
 import os
 import shutil
-from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import Callable, Optional
 
-import pandas as pd
-from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
+from pydantic import Field
 
-from test.integration.connectors.utils.constants import expected_results_path
+from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
-def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    expected_df = pd.read_csv(expected_filepath)
-    current_df = pd.read_csv(current_filepath)
-    if expected_df.equals(current_df):
-        return True
-    # Print diff
-    diff = expected_df.merge(current_df, indicator=True, how="left").loc[
-        lambda x: x["_merge"] != "both"
-    ]
-    print("diff between expected and current df:")
-    print(diff)
-    return False
-
-
-def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_soup = BeautifulSoup(expected_f, "html.parser")
-    with current_filepath.open() as current_f:
-        current_soup = BeautifulSoup(current_f, "html.parser")
-    return expected_soup.text == current_soup.text
-
-
-def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_text_lines = expected_f.readlines()
-    with current_filepath.open() as current_f:
-        current_text_lines = current_f.readlines()
-    if len(expected_text_lines) != len(current_text_lines):
-        print(
-            f"Lines in expected text file ({len(expected_text_lines)}) "
-            f"don't match current text file ({len(current_text_lines)})"
-        )
-        return False
-    expected_text = "\n".join(expected_text_lines)
-    current_text = "\n".join(current_text_lines)
-    if expected_text == current_text:
-        return True
-    print("txt content don't match:")
-    print(f"expected: {expected_text}")
-    print(f"current: {current_text}")
-    return False
-
-
-file_type_equality_check = {
-    ".json": json_equality_check,
-    ".html": html_equality_check,
-    ".txt": txt_equality_check,
-}
-
-
-@dataclass
-class ValidationConfigs:
-    test_id: str
+class SourceValidationConfigs(ValidationConfig):
     expected_number_indexed_file_data: Optional[int] = None
     expected_num_files: Optional[int] = None
     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
-    exclude_fields: list[str] = field(
+    exclude_fields: list[str] = Field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
     )
-    exclude_fields_extend: list[str] = field(default_factory=list)
+    exclude_fields_extend: list[str] = Field(default_factory=list)
     validate_downloaded_files: bool = False
     validate_file_data: bool = True
-    downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
@@ -97,9 +41,6 @@ class ValidationConfigs:
         downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
         assert len(downloaded_files) == expected_num_files
 
-    def test_output_dir(self) -> Path:
-        return expected_results_path / self.test_id
-
     def omit_ignored_fields(self, data: dict) -> dict:
         exclude_fields = self.get_exclude_fields()
         # Ignore fields that dynamically change every time the tests run
@@ -143,14 +84,14 @@ def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
 
 
 def check_contents(
-    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     found_diff = False
     for file_data in all_file_data:
         file_data_path = expected_output_dir / f"{file_data.identifier}.json"
         with file_data_path.open("r") as file:
             expected_file_data_contents = json.load(file)
-        current_file_data_contents = file_data.to_dict()
+        current_file_data_contents = file_data.model_dump()
         expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
         current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
         diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,27 +101,10 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
-def detect_diff(
-    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
-) -> bool:
-    if expected_filepath.suffix != current_filepath.suffix:
-        return True
-    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-        return not downloaded_file_equality_check(expected_filepath, current_filepath)
-    current_suffix = expected_filepath.suffix
-    if current_suffix in file_type_equality_check:
-        equality_check_callable = file_type_equality_check[current_suffix]
-        return not equality_check_callable(
-            expected_filepath=expected_filepath, current_filepath=current_filepath
-        )
-    # Fallback is using filecmp.cmp to compare the files
-    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
-
-
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
-    configs: ValidationConfigs,
+    configs: SourceValidationConfigs,
 ):
     current_files = get_files(dir_path=current_output_dir)
     found_diff = False
@@ -188,7 +112,7 @@ def check_raw_file_contents(
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if detect_diff(configs, expected_file_path, current_file_path):
+        if configs.detect_diff(expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -196,7 +120,7 @@ def check_raw_file_contents(
 
 
 def run_expected_results_validation(
-    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
     check_contents(
@@ -207,7 +131,7 @@ def run_expected_results_validation(
 def run_expected_download_files_validation(
     expected_output_dir: Path,
     current_download_dir: Path,
-    configs: ValidationConfigs,
+    configs: SourceValidationConfigs,
 ):
     check_files_in_paths(
         expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
@@ -234,12 +158,12 @@ def update_fixtures(
     save_downloads: bool = False,
     save_filedata: bool = True,
 ):
-    # Delete current files
-    shutil.rmtree(path=output_dir, ignore_errors=True)
-    output_dir.mkdir(parents=True)
     # Rewrite the current file data
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
+        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -248,7 +172,7 @@
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
             with file_data_path.open(mode="w") as f:
-                json.dump(file_data.to_dict(), f, indent=2)
+                json.dump(file_data.model_dump(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -260,6 +184,7 @@
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
+        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -268,7 +193,7 @@
 
 
 def run_all_validations(
-    configs: ValidationConfigs,
+    configs: SourceValidationConfigs,
     predownload_file_data: list[FileData],
     postdownload_file_data: list[FileData],
     download_dir: Path,
@@ -289,7 +214,10 @@ def run_all_validations(
     if configs.validate_file_data:
         run_expected_results_validation(
             expected_output_dir=test_output_dir / "file_data",
-            all_file_data=postdownload_file_data,
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=predownload_file_data,
+                all_postdownload_file_data=postdownload_file_data,
+            ),
             configs=configs,
         )
     download_files = get_files(dir_path=download_dir)
@@ -305,10 +233,23 @@
     )
 
 
+def get_all_file_data(
+    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+) -> list[FileData]:
+    all_file_data = all_postdownload_file_data
+    indexed_file_data = [
+        fd
+        for fd in all_predownload_file_data
+        if fd.identifier not in [f.identifier for f in all_file_data]
+    ]
+    all_file_data += indexed_file_data
+    return all_file_data
+
+
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
-    configs: ValidationConfigs,
+    configs: SourceValidationConfigs,
     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
 ) -> None:
     # Run common validations on the process of running a source connector, supporting dynamic
@@ -322,7 +263,7 @@ async def source_connector_validation(
     test_output_dir = configs.test_output_dir()
     for file_data in indexer.run():
         assert file_data
-        predownload_file_data = replace(file_data)
+        predownload_file_data = file_data.model_copy(deep=True)
         all_predownload_file_data.append(predownload_file_data)
         if downloader.is_async():
             resp = await downloader.run_async(file_data=file_data)
@@ -330,10 +271,10 @@
             resp = downloader.run(file_data=file_data)
         if isinstance(resp, list):
             for r in resp:
-                postdownload_file_data = replace(r["file_data"])
+                postdownload_file_data = r["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
         else:
-            postdownload_file_data = replace(resp["file_data"])
+            postdownload_file_data = resp["file_data"].model_copy(deep=True)
             all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         print("Running validation")
@@ -349,7 +290,10 @@
         update_fixtures(
             output_dir=test_output_dir,
             download_dir=download_dir,
-            all_file_data=all_postdownload_file_data,
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=all_predownload_file_data,
+                all_postdownload_file_data=all_postdownload_file_data,
+            ),
            save_downloads=configs.validate_downloaded_files,
            save_filedata=configs.validate_file_data,
         )
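
The new get_all_file_data helper keeps fixtures complete: post-download FileData wins, and any record that was indexed but never produced a download result is appended. A minimal sketch of that merge behavior (identifiers and connector type are made up):

    from unstructured_ingest.v2.interfaces import FileData

    indexed = [
        FileData(identifier="a", connector_type="example"),
        FileData(identifier="b", connector_type="example"),
    ]
    downloaded = [FileData(identifier="a", connector_type="example")]

    # mirrors get_all_file_data: downloaded entries first, indexed-only entries appended
    merged = downloaded + [
        fd for fd in indexed if fd.identifier not in {f.identifier for f in downloaded}
    ]
    assert [fd.identifier for fd in merged] == ["a", "b"]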

test/integration/connectors/utils/validation/utils.py
@@ -0,0 +1,36 @@
+import filecmp
+import shutil
+from pathlib import Path
+from typing import Callable, Optional
+
+from pydantic import BaseModel
+
+from test.integration.connectors.utils.constants import expected_results_path
+from test.integration.connectors.utils.validation.equality import file_type_equality_check
+
+
+class ValidationConfig(BaseModel):
+    test_id: str
+    file_equality_check: Optional[Callable[[Path, Path], bool]] = None
+
+    def test_output_dir(self) -> Path:
+        return expected_results_path / self.test_id
+
+    def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
+        if expected_filepath.suffix != current_filepath.suffix:
+            return True
+        if file_equality_check := self.file_equality_check:
+            return not file_equality_check(expected_filepath, current_filepath)
+        current_suffix = expected_filepath.suffix
+        if current_suffix in file_type_equality_check:
+            equality_check_callable = file_type_equality_check[current_suffix]
+            return not equality_check_callable(
+                expected_filepath=expected_filepath, current_filepath=current_filepath
+            )
+        # Fallback is using filecmp.cmp to compare the files
+        return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
+def reset_dir(dir_path: Path) -> None:
+    shutil.rmtree(path=dir_path, ignore_errors=True)
+    dir_path.mkdir(parents=True)
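
ValidationConfig now owns detect_diff, so source and stager validation share one comparison policy: mismatched suffixes are an immediate diff, a per-test file_equality_check overrides everything, known suffixes dispatch through file_type_equality_check, and anything else falls back to filecmp. A hypothetical direct use (test id and paths are made up):

    from pathlib import Path

    from test.integration.connectors.utils.validation.utils import ValidationConfig

    config = ValidationConfig(test_id="example-connector")
    if config.detect_diff(
        expected_filepath=Path("expected/result.ndjson"),
        current_filepath=Path("current/result.ndjson"),
    ):
        raise AssertionError("current output drifted from the stored fixture")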

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.8"  # pragma: no cover
+__version__ = "0.3.10"  # pragma: no cover

unstructured_ingest/utils/chunking.py
@@ -1,4 +1,7 @@
+import base64
 import hashlib
+import json
+import zlib
 from itertools import groupby
 
 
@@ -43,3 +46,11 @@ def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
         e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]
 
     return elements
+
+
+def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
+    decoded_b64_bytes = base64.b64decode(raw_s)
+    elements_json_bytes = zlib.decompress(decoded_b64_bytes)
+    elements_json_str = elements_json_bytes.decode("utf-8")
+    element_dicts = json.loads(elements_json_str)
+    return element_dicts
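
elements_from_base64_gzipped_json reverses a base64 + zlib + JSON encoding, letting connectors unpack element payloads that were compressed into a single string field. A round-trip sketch using only the standard library (the sample element is made up):

    import base64
    import json
    import zlib

    from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json

    elements = [{"type": "CompositeElement", "text": "hello world"}]
    encoded = base64.b64encode(zlib.compress(json.dumps(elements).encode("utf-8"))).decode("utf-8")
    assert elements_from_base64_gzipped_json(encoded) == elements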

unstructured_ingest/utils/data_prep.py
@@ -1,8 +1,10 @@
 import itertools
 import json
 from datetime import datetime
+from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
 
+import ndjson
 import pandas as pd
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -131,3 +133,37 @@ def validate_date_args(date: Optional[str] = None) -> bool:
         f"The argument {date} does not satisfy the format:"
         f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
     )
+
+
+def get_data(path: Path) -> list[dict]:
+    with path.open() as f:
+        if path.suffix == ".json":
+            return json.load(f)
+        elif path.suffix == ".ndjson":
+            return ndjson.load(f)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df.to_dict(orient="records")
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df.to_dict(orient="records")
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
+
+
+def get_data_df(path: Path) -> pd.DataFrame:
+    with path.open() as f:
+        if path.suffix == ".json":
+            data = json.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".ndjson":
+            data = ndjson.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
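
get_data and get_data_df give callers a single entry point for loading results regardless of on-disk format (.json, .ndjson, .csv, .parquet). A hypothetical usage sketch (the path is made up; the ndjson and parquet branches require the ndjson and pyarrow packages):

    from pathlib import Path

    from unstructured_ingest.utils.data_prep import get_data, get_data_df

    staged = Path("staged/elements.ndjson")  # hypothetical staged output file
    records = get_data(path=staged)   # list[dict], one entry per element
    df = get_data_df(path=staged)     # the same rows as a pandas DataFrame
    assert len(records) == len(df)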

unstructured_ingest/v2/interfaces/__init__.py
@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
     "ConnectionConfig",
     "BaseConnector",
     "FileDataSourceMetadata",
+    "BatchFileData",
+    "BatchItem",
 ]

unstructured_ingest/v2/interfaces/file_data.py
@@ -1,13 +1,14 @@
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Literal, Optional
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
 
-from dataclasses_json import DataClassJsonMixin
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
 
+from unstructured_ingest.v2.logger import logger
 
-@dataclass
-class SourceIdentifiers:
+
+class SourceIdentifiers(BaseModel):
     filename: str
     fullpath: str
     rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath
 
 
-@dataclass
-class FileDataSourceMetadata(DataClassJsonMixin):
+class FileDataSourceMetadata(BaseModel):
     url: Optional[str] = None
     version: Optional[str] = None
     record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
     filesize_bytes: Optional[int] = None
 
 
-@dataclass
-class FileData(DataClassJsonMixin):
+class FileData(BaseModel):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
-    doc_type: Literal["file", "batch"] = field(default="file")
-    metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
     display_name: Optional[str] = None
@@ -52,11 +50,57 @@ class FileData(DataClassJsonMixin):
             raise ValueError(f"file path not valid: {path}")
         with open(str(path.resolve()), "rb") as f:
             file_data_dict = json.load(f)
-        file_data = FileData.from_dict(file_data_dict)
+        file_data = cls.model_validate(file_data_dict)
         return file_data
 
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
     def to_file(self, path: str) -> None:
         path = Path(path).resolve()
         path.parent.mkdir(parents=True, exist_ok=True)
         with open(str(path.resolve()), "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not valid for batch file data")
+
+    return FileData.from_file(path=path)
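
BatchFileData lets a connector treat a whole batch of records as one unit of work: batch_items must be non-empty and unique, they are sorted for stability, and identifier is derived deterministically via uuid5 from the item identifiers and versions, so the same batch always maps to the same file data. The new file_data_from_file helper then tries BatchFileData first and falls back to plain FileData when deserializing. A sketch of the identifier behavior (connector type and item ids are made up):

    from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem

    batch = BatchFileData(
        connector_type="example-connector",
        batch_items=[BatchItem(identifier="doc-2"), BatchItem(identifier="doc-1", version="3")],
    )
    # items are sorted by identifier; the identifier is a stable uuid5 of ids/versions
    assert [item.identifier for item in batch.batch_items] == ["doc-1", "doc-2"]

    same_batch = BatchFileData(
        connector_type="example-connector",
        batch_items=[BatchItem(identifier="doc-1", version="3"), BatchItem(identifier="doc-2")],
    )
    assert batch.identifier == same_batch.identifier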