unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of unstructured-ingest has been flagged as potentially problematic; see the registry listing for details.

Files changed (52)
  1. test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
  2. test/integration/connectors/sql/test_postgres.py +9 -5
  3. test/integration/connectors/sql/test_singlestore.py +9 -5
  4. test/integration/connectors/sql/test_snowflake.py +6 -2
  5. test/integration/connectors/sql/test_sqlite.py +9 -5
  6. test/integration/connectors/test_astradb.py +40 -0
  7. test/integration/connectors/test_kafka.py +2 -2
  8. test/integration/connectors/test_mongodb.py +4 -1
  9. test/integration/connectors/utils/validation/source.py +31 -11
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  12. unstructured_ingest/v2/interfaces/file_data.py +69 -15
  13. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  15. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  16. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  17. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  18. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  19. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  20. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  21. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  22. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  23. unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
  24. unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
  25. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
  26. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
  27. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
  28. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
  29. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
  30. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  31. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  32. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  33. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  34. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  35. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  36. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  37. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  38. unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
  39. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  40. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  41. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  42. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  43. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  44. unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
  45. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  46. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
  47. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
  48. /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
  49. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
  50. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
  51. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
  52. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
test/integration/connectors/databricks/test_volumes_native.py

@@ -1,10 +1,10 @@
 import json
 import os
-import tempfile
 import uuid
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
+from unittest import mock

 import pytest
 from databricks.sdk import WorkspaceClient
@@ -31,11 +31,15 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor


 @dataclass
-class EnvData:
+class BaseEnvData:
     host: str
+    catalog: str
+
+
+@dataclass
+class BasicAuthEnvData(BaseEnvData):
     client_id: str
     client_secret: str
-    catalog: str

     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
         return DatabricksNativeVolumesConnectionConfig(
@@ -47,8 +51,21 @@ class EnvData:
         )


-def get_env_data() -> EnvData:
-    return EnvData(
+@dataclass
+class PATEnvData(BaseEnvData):
+    token: str
+
+    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
+        return DatabricksNativeVolumesConnectionConfig(
+            host=self.host,
+            access_config=DatabricksNativeVolumesAccessConfig(
+                token=self.token,
+            ),
+        )
+
+
+def get_basic_auth_env_data() -> BasicAuthEnvData:
+    return BasicAuthEnvData(
         host=os.environ["DATABRICKS_HOST"],
         client_id=os.environ["DATABRICKS_CLIENT_ID"],
         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
@@ -56,23 +73,30 @@ def get_env_data() -> EnvData:
     )


+def get_pat_env_data() -> PATEnvData:
+    return PATEnvData(
+        host=os.environ["DATABRICKS_HOST"],
+        catalog=os.environ["DATABRICKS_CATALOG"],
+        token=os.environ["DATABRICKS_PAT"],
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
 @requires_env(
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
-async def test_volumes_native_source():
-    env_data = get_env_data()
-    indexer_config = DatabricksNativeVolumesIndexerConfig(
-        recursive=True,
-        volume="test-platform",
-        volume_path="databricks-volumes-test-input",
-        catalog=env_data.catalog,
-    )
-    connection_config = env_data.get_connection_config()
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
+async def test_volumes_native_source(tmp_path: Path):
+    env_data = get_basic_auth_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
         indexer = DatabricksNativeVolumesIndexer(
             connection_config=connection_config, index_config=indexer_config
         )
@@ -89,12 +113,44 @@ async def test_volumes_native_source():
     )


+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
+async def test_volumes_native_source_pat(tmp_path: Path):
+    env_data = get_pat_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
+        indexer = DatabricksNativeVolumesIndexer(
+            connection_config=connection_config, index_config=indexer_config
+        )
+        downloader = DatabricksNativeVolumesDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=SourceValidationConfigs(
+                test_id="databricks_volumes_native_pat",
+                expected_num_files=1,
+            ),
+        )
+
+
 def _get_volume_path(catalog: str, volume: str, volume_path: str):
     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"


 @contextmanager
-def databricks_destination_context(env_data: EnvData, volume: str, volume_path) -> WorkspaceClient:
+def databricks_destination_context(
+    env_data: BasicAuthEnvData, volume: str, volume_path
+) -> WorkspaceClient:
     client = WorkspaceClient(
         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
     )
@@ -137,7 +193,7 @@ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_p
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
 async def test_volumes_native_destination(upload_file: Path):
-    env_data = get_env_data()
+    env_data = get_basic_auth_env_data()
     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
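
The updated Databricks source tests read credentials into typed env-data helpers (get_basic_auth_env_data / get_pat_env_data) and then clear the process environment, so the SDK can only authenticate with whatever the connection config passes explicitly. A minimal, standalone sketch of that isolation pattern (stdlib only; the host value here is illustrative):

import os
from unittest import mock

# Read what is needed up front, before the environment is cleared.
creds = {"host": os.environ.get("DATABRICKS_HOST", "https://example.cloud.databricks.com")}

with mock.patch.dict(os.environ, clear=True):
    # Inside this block os.environ is empty, so nothing can silently fall back
    # to ambient DATABRICKS_* variables; auth must come from `creds`.
    assert "DATABRICKS_HOST" not in os.environ
    print(f"connecting to {creds['host']} using explicitly passed credentials")
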
test/integration/connectors/sql/test_postgres.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     CONNECTOR_TYPE,
     PostgresAccessConfig,
@@ -28,7 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     PostgresUploadStager,
 )

-SEED_DATA_ROWS = 20
+SEED_DATA_ROWS = 10


 @pytest.fixture
@@ -69,7 +69,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
     )
     indexer = PostgresIndexer(
         connection_config=connection_config,
-        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = PostgresDownloader(
         connection_config=connection_config,
@@ -81,7 +81,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
         configs=SourceValidationConfigs(
             test_id="postgres",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=4,
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -119,7 +119,11 @@ def validate_destination(
 async def test_postgres_destination(upload_file: Path, temp_dir: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
     ):
test/integration/connectors/sql/test_singlestore.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     CONNECTOR_TYPE,
     SingleStoreAccessConfig,
@@ -29,7 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     SingleStoreUploadStager,
 )

-SEED_DATA_ROWS = 20
+SEED_DATA_ROWS = 10


 @pytest.fixture
@@ -66,7 +66,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
     )
     indexer = SingleStoreIndexer(
         connection_config=connection_config,
-        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SingleStoreDownloader(
         connection_config=connection_config,
@@ -80,7 +80,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
         configs=SourceValidationConfigs(
             test_id="singlestore",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=4,
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -103,7 +103,11 @@ def validate_destination(
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
 async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
     ):
test/integration/connectors/sql/test_snowflake.py

@@ -17,7 +17,7 @@ from test.integration.connectors.utils.validation.source import (
     source_connector_validation,
 )
 from test.integration.utils import requires_env
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
     CONNECTOR_TYPE,
     SnowflakeAccessConfig,
@@ -170,7 +170,11 @@ async def test_snowflake_destination(
 ):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     init_db_destination()
     stager = SnowflakeUploadStager()
     staged_path = stager.run(
test/integration/connectors/sql/test_sqlite.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     CONNECTOR_TYPE,
     SQLiteConnectionConfig,
@@ -27,7 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     SQLiteUploadStager,
 )

-SEED_DATA_ROWS = 20
+SEED_DATA_ROWS = 10


 @pytest.fixture
@@ -57,7 +57,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
     connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
     indexer = SQLiteIndexer(
         connection_config=connection_config,
-        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SQLiteDownloader(
         connection_config=connection_config,
@@ -69,7 +69,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
         configs=SourceValidationConfigs(
             test_id="sqlite",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=4,
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -116,7 +116,11 @@ async def test_sqlite_destination(
 ):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     stager = SQLiteUploadStager()
     staged_path = stager.run(
         elements_filepath=upload_file,
test/integration/connectors/test_astradb.py

@@ -14,12 +14,18 @@ from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
     stager_validation,
 )
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (
     CONNECTOR_TYPE,
     AstraDBAccessConfig,
     AstraDBConnectionConfig,
+    AstraDBDownloader,
+    AstraDBDownloaderConfig,
     AstraDBIndexer,
     AstraDBIndexerConfig,
     AstraDBUploader,
@@ -110,6 +116,40 @@ def collection(upload_file: Path) -> Collection:
     astra_db.drop_collection(collection)


+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+async def test_astra_search_source(
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    collection_name = "ingest_test_src"
+    connection_config = AstraDBConnectionConfig(
+        access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
+    )
+    indexer = AstraDBIndexer(
+        index_config=AstraDBIndexerConfig(
+            collection_name=collection_name,
+        ),
+        connection_config=connection_config,
+    )
+    downloader = AstraDBDownloader(
+        connection_config=connection_config,
+        download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
+    )
+
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=5,
+            expected_number_indexed_file_data=1,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
test/integration/connectors/test_kafka.py

@@ -122,7 +122,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+            test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
         ),
     )

@@ -204,7 +204,7 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka",
+            test_id="kafka-cloud",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
             validate_downloaded_files=True,
test/integration/connectors/test_mongodb.py

@@ -197,7 +197,10 @@ async def test_mongodb_source(temp_dir: Path):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            expected_number_indexed_file_data=1,
         ),
     )

test/integration/connectors/utils/validation/source.py

@@ -1,14 +1,13 @@
 import json
 import os
 import shutil
-from dataclasses import replace
 from pathlib import Path
 from typing import Callable, Optional

 from deepdiff import DeepDiff
 from pydantic import Field

-from test.integration.connectors.utils.validation.utils import ValidationConfig, reset_dir
+from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer

@@ -92,7 +91,7 @@ def check_contents(
     file_data_path = expected_output_dir / f"{file_data.identifier}.json"
     with file_data_path.open("r") as file:
         expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.to_dict()
+    current_file_data_contents = file_data.model_dump()
     expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
     current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
     diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,9 +159,11 @@ def update_fixtures(
     save_filedata: bool = True,
 ):
     # Rewrite the current file data
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
-        reset_dir(dir_path=file_data_output_path)
+        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -171,7 +172,7 @@ def update_fixtures(
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
             with file_data_path.open(mode="w") as f:
-                json.dump(file_data.to_dict(), f, indent=2)
+                json.dump(file_data.model_dump(), f, indent=2)

     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -183,7 +184,7 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
-        reset_dir(raw_download_output_path)
+        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -213,7 +214,10 @@ def run_all_validations(
     if configs.validate_file_data:
         run_expected_results_validation(
             expected_output_dir=test_output_dir / "file_data",
-            all_file_data=postdownload_file_data,
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=predownload_file_data,
+                all_postdownload_file_data=postdownload_file_data,
+            ),
             configs=configs,
         )
     download_files = get_files(dir_path=download_dir)
@@ -229,6 +233,19 @@ def run_all_validations(
     )


+def get_all_file_data(
+    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+) -> list[FileData]:
+    all_file_data = all_postdownload_file_data
+    indexed_file_data = [
+        fd
+        for fd in all_predownload_file_data
+        if fd.identifier not in [f.identifier for f in all_file_data]
+    ]
+    all_file_data += indexed_file_data
+    return all_file_data
+
+
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
@@ -246,7 +263,7 @@ async def source_connector_validation(
     test_output_dir = configs.test_output_dir()
     for file_data in indexer.run():
         assert file_data
-        predownload_file_data = replace(file_data)
+        predownload_file_data = file_data.model_copy(deep=True)
         all_predownload_file_data.append(predownload_file_data)
         if downloader.is_async():
             resp = await downloader.run_async(file_data=file_data)
@@ -254,10 +271,10 @@ async def source_connector_validation(
             resp = downloader.run(file_data=file_data)
         if isinstance(resp, list):
             for r in resp:
-                postdownload_file_data = replace(r["file_data"])
+                postdownload_file_data = r["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
         else:
-            postdownload_file_data = replace(resp["file_data"])
+            postdownload_file_data = resp["file_data"].model_copy(deep=True)
            all_postdownload_file_data.append(postdownload_file_data)
         if not overwrite_fixtures:
             print("Running validation")
@@ -273,7 +290,10 @@ async def source_connector_validation(
         update_fixtures(
             output_dir=test_output_dir,
             download_dir=download_dir,
-            all_file_data=all_postdownload_file_data,
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=all_predownload_file_data,
+                all_postdownload_file_data=all_postdownload_file_data,
+            ),
             save_downloads=configs.validate_downloaded_files,
             save_filedata=configs.validate_file_data,
         )
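
Because FileData is now a pydantic model (see the interfaces diff below), the validation helpers snapshot and serialize it with model_copy(deep=True) and model_dump() instead of dataclasses.replace and to_dict. A small sketch of that usage, with illustrative field values:

from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers

file_data = FileData(
    identifier="example-id",
    connector_type="example",
    source_identifiers=SourceIdentifiers(filename="doc.pdf", fullpath="docs/doc.pdf"),
)

snapshot = file_data.model_copy(deep=True)  # replaces dataclasses.replace(file_data)
as_dict = file_data.model_dump()            # replaces file_data.to_dict()
assert snapshot == file_data
assert as_dict["source_identifiers"]["filename"] == "doc.pdf"
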
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.3.9" # pragma: no cover
+__version__ = "0.3.11" # pragma: no cover
unstructured_ingest/v2/interfaces/__init__.py

@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
     "ConnectionConfig",
     "BaseConnector",
     "FileDataSourceMetadata",
+    "BatchFileData",
+    "BatchItem",
 ]
unstructured_ingest/v2/interfaces/file_data.py

@@ -1,13 +1,14 @@
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Literal, Optional
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5

-from dataclasses_json import DataClassJsonMixin
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator

+from unstructured_ingest.v2.logger import logger

-@dataclass
-class SourceIdentifiers:
+
+class SourceIdentifiers(BaseModel):
     filename: str
     fullpath: str
     rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath


-@dataclass
-class FileDataSourceMetadata(DataClassJsonMixin):
+class FileDataSourceMetadata(BaseModel):
     url: Optional[str] = None
     version: Optional[str] = None
     record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
     filesize_bytes: Optional[int] = None


-@dataclass
-class FileData(DataClassJsonMixin):
+class FileData(BaseModel):
     identifier: str
     connector_type: str
-    source_identifiers: Optional[SourceIdentifiers] = None
-    doc_type: Literal["file", "batch"] = field(default="file")
-    metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
+    source_identifiers: SourceIdentifiers
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
     display_name: Optional[str] = None
@@ -52,11 +50,67 @@ class FileData(DataClassJsonMixin):
             raise ValueError(f"file path not valid: {path}")
         with open(str(path.resolve()), "rb") as f:
             file_data_dict = json.load(f)
-        file_data = FileData.from_dict(file_data_dict)
+        file_data = cls.model_validate(file_data_dict)
         return file_data

+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
     def to_file(self, path: str) -> None:
         path = Path(path).resolve()
         path.parent.mkdir(parents=True, exist_ok=True)
         with open(str(path.resolve()), "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not valid for batch file data")
+
+    return FileData.from_file(path=path)
+
+
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+
+    return FileData.model_validate(data)
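
A short sketch of how the new batch-aware model behaves, based on the classes above: BatchFileData derives a deterministic uuid5 identifier from its sorted batch items (duplicates are rejected), and file_data_from_dict falls back to plain FileData when no batch_items are present. Field values here are illustrative:

from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem
from unstructured_ingest.v2.interfaces.file_data import file_data_from_dict

batch = BatchFileData(
    connector_type="example",
    batch_items=[BatchItem(identifier="b"), BatchItem(identifier="a", version="2")],
)
print(batch.identifier)  # stable uuid5 derived from the item identifiers/versions
print([item.identifier for item in batch.batch_items])  # ['a', 'b'] after sorting

plain = file_data_from_dict(
    {
        "identifier": "single-file",
        "connector_type": "example",
        "source_identifiers": {"filename": "doc.pdf", "fullpath": "docs/doc.pdf"},
    }
)
print(type(plain).__name__)  # FileData
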