unstructured-ingest 0.5.21__py3-none-any.whl → 0.5.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_vectara.py +67 -55
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +8 -1
- unstructured_ingest/v2/interfaces/file_data.py +13 -116
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +62 -18
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +7 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +10 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +6 -3
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest/v2/types/file_data.py +116 -0
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/METADATA +26 -26
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/RECORD +17 -15
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_vectara.py CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 import time
+from functools import lru_cache
 from pathlib import Path
 from typing import Generator
 from uuid import uuid4
@@ -25,24 +26,29 @@ from unstructured_ingest.v2.processes.connectors.vectara import (
 )


-def validate_upload(
+def validate_upload(document: dict, expected_data: dict):
+    logger.info(f"validating document: {document}")
     element_id = expected_data["element_id"]
     expected_text = expected_data["text"]
     filename = expected_data["metadata"]["filename"]
     filetype = expected_data["metadata"]["filetype"]
     page_number = expected_data["metadata"]["page_number"]

-
-
-    assert
-
-    assert
-
-    assert
-    assert
+    assert document is not None
+    speech_parts = document["parts"]
+    assert speech_parts
+    first_part = speech_parts[0]
+    assert first_part["text"] == expected_text
+    part_metadata = first_part["metadata"]
+    assert part_metadata
+    assert part_metadata["element_id"] == element_id
+    assert part_metadata["filename"] == filename
+    assert part_metadata["filetype"] == filetype
+    assert part_metadata["page_number"] == page_number


 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+@lru_cache()
 def _get_jwt_token():
     """Connect to the server and get a JWT token."""
     customer_id = os.environ["VECTARA_CUSTOMER_ID"]
@@ -65,23 +71,12 @@ def _get_jwt_token():
     return response_json.get("access_token")


-def query_data(corpus_key: str, element_id: str) -> dict:
+def list_documents(corpus_key: str) -> list[str]:

-    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents"

     # the query below requires the corpus to have filter attributes for element_id

-    data = json.dumps(
-        {
-            "query": "string",
-            "search": {
-                "metadata_filter": f"part.element_id = '{element_id}'",
-                "lexical_interpolation": 1,
-                "limit": 10,
-            },
-        }
-    )
-
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -90,11 +85,26 @@ def query_data(corpus_key: str, element_id: str) -> dict:
         "X-source": "unstructured",
     }

-    response = requests.
+    response = requests.get(url, headers=headers)
     response.raise_for_status()
     response_json = response.json()
+    documents = response_json.get("documents", [])
+    return documents
+

-
+def fetch_document(corpus_key: str, documents_id: str) -> dict:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    return response.json()


 def create_corpora(corpus_key: str, corpus_name: str) -> None:
@@ -148,8 +158,8 @@ def delete_corpora(corpus_key: str) -> None:
     response.raise_for_status()


-def list_corpora() -> list:
-    url = "https://api.vectara.io/v2/corpora"
+def get_metadata(corpus_key: str):
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -159,35 +169,28 @@ def list_corpora() -> list:
     }
     response = requests.get(url, headers=headers)
     response.raise_for_status()
-
-    if response_json.get("corpora"):
-        return [item["key"] for item in response_json.get("corpora")]
-    else:
-        return []
+    return response.json()


 def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
-    def is_ready_status():
-        corpora_list = list_corpora()
-        return corpus_key in corpora_list
-
     start = time.time()
-
-
-
-
-
-
+    while time.time() - start < timeout:
+        try:
+            get_metadata(corpus_key)
+            return
+        except requests.HTTPError:
+            time.sleep(interval)
+    raise TimeoutError("time out waiting for corpus to be ready")


 def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
     start = time.time()
     while time.time() - start < timeout:
-
-
+        try:
+            get_metadata(corpus_key)
+            time.sleep(interval)
+        except requests.HTTPError:
             return
-        time.sleep(interval)
-
     raise TimeoutError("time out waiting for corpus to delete")


@@ -210,11 +213,23 @@ def corpora_util() -> Generator[str, None, None]:
        wait_for_delete(corpus_key=corpus_key)


+def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[str]:
+    start = time.time()
+    while time.time() - start < timeout:
+        all_document_meta = list_documents(corpus_key)
+        if not all_document_meta:
+            time.sleep(interval)
+            continue
+        else:
+            return all_document_meta
+    raise TimeoutError("time out waiting for document to be ready")
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
 async def test_vectara_destination(
-    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
 ):
     corpus_key = corpora_util
     connection_kwargs = {
@@ -231,7 +246,7 @@ async def test_vectara_destination(
         identifier="mock-file-data",
     )

-    stager_config = VectaraUploadStagerConfig(
+    stager_config = VectaraUploadStagerConfig()
     stager = VectaraUploadStager(upload_stager_config=stager_config)
     new_upload_file = stager.run(
         elements_filepath=upload_file,
@@ -260,11 +275,8 @@ async def test_vectara_destination(
         elements = json.load(upload_fp)
         first_element = elements[0]

-
-
-
-
-
-            break
-
-    validate_upload(response=response, expected_data=first_element)
+    all_document_meta = wait_for_doc_meta(corpus_key)
+    assert len(all_document_meta) == 1
+    document_meta = all_document_meta[0]
+    document = fetch_document(corpus_key=corpus_key, documents_id=document_meta["id"])
+    validate_upload(document=document, expected_data=first_element)
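Note: the test now validates against the document shape returned by Vectara's v2 documents API rather than a query response. A hedged sketch of a payload that would satisfy the new validate_upload assertions; every field value here is illustrative and only the key names are taken from the assertions above:

# Hypothetical fetch_document() result; keys mirror validate_upload's assertions.
document = {
    "id": "doc-1",
    "parts": [
        {
            "text": "expected element text",
            "metadata": {
                "element_id": "abc123",
                "filename": "fake-memo.pdf",
                "filetype": "application/pdf",
                "page_number": 1,
            },
        }
    ],
}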
unstructured_ingest/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.21"  # pragma: no cover
+__version__ = "0.5.25"  # pragma: no cover
unstructured_ingest/v2/interfaces/__init__.py CHANGED
@@ -1,6 +1,13 @@
+from unstructured_ingest.v2.types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
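Note: everything that moved keeps working through the re-export above, but the new module is now the canonical home. A minimal sketch of the two import paths; the connector_type value is a hypothetical placeholder, not anything this diff prescribes:

# Preferred import path as of 0.5.25:
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers

# Old path, still resolves via the re-export in v2/interfaces/__init__.py:
# from unstructured_ingest.v2.interfaces import FileData

file_data = FileData(
    identifier="example-1",
    connector_type="local",  # hypothetical value, for illustration only
    source_identifiers=SourceIdentifiers(filename="doc.pdf", fullpath="/tmp/doc.pdf"),
)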
unstructured_ingest/v2/interfaces/file_data.py CHANGED
@@ -1,116 +1,13 @@
-import json
-from pathlib import Path
-from typing import Any, Optional
-from uuid import NAMESPACE_DNS, uuid5
-
-from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
-
-from unstructured_ingest.v2.logger import logger
-
-
-class SourceIdentifiers(BaseModel):
-    filename: str
-    fullpath: str
-    rel_path: Optional[str] = None
-
-    @property
-    def filename_stem(self) -> str:
-        return Path(self.filename).stem
-
-    @property
-    def relative_path(self) -> str:
-        return self.rel_path or self.fullpath
-
-
-class FileDataSourceMetadata(BaseModel):
-    url: Optional[str] = None
-    version: Optional[str] = None
-    record_locator: Optional[dict[str, Any]] = None
-    date_created: Optional[str] = None
-    date_modified: Optional[str] = None
-    date_processed: Optional[str] = None
-    permissions_data: Optional[list[dict[str, Any]]] = None
-    filesize_bytes: Optional[int] = None
-
-
-class FileData(BaseModel):
-    identifier: str
-    connector_type: str
-    source_identifiers: SourceIdentifiers
-    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = Field(default_factory=dict)
-    reprocess: bool = False
-    local_download_path: Optional[str] = None
-    display_name: Optional[str] = None
-
-    @classmethod
-    def from_file(cls, path: str) -> "FileData":
-        path = Path(path).resolve()
-        if not path.exists() or not path.is_file():
-            raise ValueError(f"file path not valid: {path}")
-        with open(str(path.resolve()), "rb") as f:
-            file_data_dict = json.load(f)
-        file_data = cls.model_validate(file_data_dict)
-        return file_data
-
-    @classmethod
-    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
-        file_data_dict = file_data.model_dump()
-        return cls.model_validate(file_data_dict, **kwargs)
-
-    def to_file(self, path: str) -> None:
-        path = Path(path).resolve()
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with open(str(path.resolve()), "w") as f:
-            json.dump(self.model_dump(), f, indent=2)
-
-
-class BatchItem(BaseModel):
-    identifier: str
-    version: Optional[str] = None
-
-
-class BatchFileData(FileData):
-    identifier: str = Field(init=False)
-    batch_items: list[BatchItem]
-    source_identifiers: Optional[SourceIdentifiers] = None
-
-    @field_validator("batch_items")
-    @classmethod
-    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
-        if not v:
-            raise ValueError("batch items cannot be empty")
-        all_identifiers = [item.identifier for item in v]
-        if len(all_identifiers) != len(set(all_identifiers)):
-            raise ValueError(f"duplicate identifiers: {all_identifiers}")
-        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
-        return sorted_batch_items
-
-    @model_validator(mode="before")
-    @classmethod
-    def populate_identifier(cls, data: Any) -> Any:
-        if isinstance(data, dict) and "identifier" not in data:
-            batch_items = data["batch_items"]
-            identifier_data = json.dumps(
-                {item.identifier: item.version for item in batch_items}, sort_keys=True
-            )
-            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
-        return data
-
-
-def file_data_from_file(path: str) -> FileData:
-    try:
-        return BatchFileData.from_file(path=path)
-    except ValidationError:
-        logger.debug(f"{path} not detected as batch file data")
-
-    return FileData.from_file(path=path)
-
-
-def file_data_from_dict(data: dict) -> FileData:
-    try:
-        return BatchFileData.model_validate(data)
-    except ValidationError:
-        logger.debug(f"{data} not valid for batch file data")
-
-    return FileData.model_validate(data)
+"""
+COMPATABILITY NOTICE:
+This file has moved to the v2/types/ module.
+The following line exists for backward compatibility.
+"""
+
+from unstructured_ingest.v2.types.file_data import *  # noqa - star imports are bad, but this is for maximal backward compatability
+
+# Eventually this file should go away. Let's start warning users now:
+logger.warning(  # noqa - using logger from the star import
+    "Importing file_data.py through interfaces is deprecated. "
+    "Please use unstructured_ingest.v2.types.file_data instead!"
+)
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py CHANGED
@@ -1,13 +1,13 @@
-import json
 import os
+import tempfile
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field

-from unstructured_ingest.utils.data_prep import write_data
+from unstructured_ingest.utils.data_prep import get_data_df, write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -22,6 +22,9 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp

 CONNECTOR_TYPE = "databricks_volume_delta_tables"

+if TYPE_CHECKING:
+    from pandas import DataFrame
+

 class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
     database: str = Field(description="Database name", default="default")
@@ -30,10 +33,12 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix

 @dataclass
 class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-
+        final_output_path = output_path.with_suffix(".json")
+        write_data(path=final_output_path, data=data, indent=None)
+        return final_output_path


 @dataclass
@@ -41,6 +46,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
+    _columns: Optional[dict[str, str]] = None

     def precheck(self) -> None:
         with self.connection_config.get_cursor() as cursor:
@@ -84,20 +90,58 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor

-    def
-
-
-
-
-
-
+    def get_table_columns(self) -> dict[str, str]:
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = {desc[0]: desc[1] for desc in cursor.description}
+        return self._columns
+
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
+        table_columns = self.get_table_columns()
+        columns = set(df.columns)
+        schema_fields = set(table_columns.keys())
+        columns_to_drop = columns - schema_fields
+        missing_columns = schema_fields - columns
+
+        if columns_to_drop:
+            logger.info(
+                "Following columns will be dropped to match the table's schema: "
+                f"{', '.join(columns_to_drop)}"
+            )
+        if missing_columns and add_missing_columns:
+            logger.info(
+                "Following null filled columns will be added to match the table's schema:"
+                f" {', '.join(missing_columns)} "
             )
-
-
-
-
-
-
+
+        df = df.drop(columns=columns_to_drop)
+
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
+        return df
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            df = get_data_df()
+            df = self._fit_to_schema(df=df)
+            temp_path = Path(temp_dir) / path.name
+            df.to_json(temp_path, orient="records", lines=False)
+            with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
+                catalog_path = self.get_output_path(file_data=file_data)
+                logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+                cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+                logger.debug(
+                    f"migrating content from {catalog_path} to "
+                    f"table {self.upload_config.table_name}"
+                )
+                columns = list(df.columns)
+                column_str = ", ".join(columns)
+                sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+                cursor.execute(sql_statment)


 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
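Note: _fit_to_schema reconciles the staged DataFrame with the destination table's columns before the INSERT. A self-contained sketch of the same drop/backfill logic on a toy frame; the column names and pretend table schema are hypothetical:

import pandas as pd

df = pd.DataFrame([{"text": "hello", "extra": 1}])          # staged data
table_columns = {"text": "string", "page_number": "int"}    # pretend table schema

columns_to_drop = set(df.columns) - set(table_columns)      # {"extra"}
missing_columns = set(table_columns) - set(df.columns)      # {"page_number"}

df = df.drop(columns=columns_to_drop)
for column in missing_columns:
    df[column] = pd.Series()  # null-filled column, matching the connector's approach

print(sorted(df.columns))  # ['page_number', 'text']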
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union

-from pydantic import BaseModel, Field, Secret, SecretStr
+from pydantic import BaseModel, Field, Secret, SecretStr, field_validator

 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -98,6 +98,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
     ca_certs: Optional[Path] = None
     access_config: Secret[ElasticsearchAccessConfig]

+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
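Note: the same hosts coercion is added to the OpenSearch config below. A minimal runnable sketch of what a mode="before" validator does; HostsConfig is a hypothetical stand-in for the connection configs in this diff:

from typing import Optional
from pydantic import BaseModel, field_validator

class HostsConfig(BaseModel):
    hosts: Optional[list[str]] = None

    @field_validator("hosts", mode="before")
    def to_list(cls, value):
        # runs before list[str] validation, so a bare string gets wrapped
        if isinstance(value, str):
            return [value]
        return value

assert HostsConfig(hosts="http://localhost:9200").hosts == ["http://localhost:9200"]
assert HostsConfig(hosts=["http://a:9200"]).hosts == ["http://a:9200"]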
unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py CHANGED
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional

-from pydantic import BaseModel, Field, Secret
+from pydantic import BaseModel, Field, Secret, field_validator

 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -78,6 +78,12 @@ class OpenSearchConnectionConfig(ConnectionConfig):

     access_config: Secret[OpenSearchAccessConfig]

+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
unstructured_ingest/v2/processes/connectors/fsspec/s3.py CHANGED
@@ -33,6 +33,9 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (

 CONNECTOR_TYPE = "s3"

+# https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters
+CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
+
 if TYPE_CHECKING:
     from s3fs import S3FileSystem

@@ -91,7 +94,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
         if isinstance(e, PermissionError):
             return UserAuthError(e)
         if isinstance(e, FileNotFoundError):
-            return UserError(e)
+            return UserError(f"File not found: {e}")
         if cause := getattr(e, "__cause__", None):
             error_response = cause.response
             error_meta = error_response["ResponseMetadata"]
@@ -140,6 +143,12 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
+        issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
+        if issue_characters:
+            logger.warning(
+                f"File path {path} contains characters "
+                f"that can cause issues with S3: {issue_characters}"
+            )
         return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
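Note: the indexer only warns about such keys; it does not reject them. A standalone sketch of the check; the helper name is hypothetical:

# Characters AWS recommends avoiding, per the S3 object-key guidelines linked above.
CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]

def problem_chars(path: str) -> list[str]:
    # hypothetical helper mirroring the check added to S3Indexer
    return [char for char in CHARACTERS_TO_AVOID if char in path]

print(problem_chars("bucket/report#final|v2.pdf"))  # ['#', '|']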
unstructured_ingest/v2/processes/connectors/sql/sql.py CHANGED
@@ -251,8 +251,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df

-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         write_data(path=output_path, data=data)
+        return output_path

     def run(
         self,
@@ -278,8 +279,10 @@ class SQLUploadStager(UploadStager):
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

-        self.write_output(
-
+        final_output_path = self.write_output(
+            output_path=output_path, data=df.to_dict(orient="records")
+        )
+        return final_output_path


 class SQLUploaderConfig(UploaderConfig):
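Note: having write_output return the final path lets a subclass redirect the staged file (the Databricks stager above rewrites the suffix) while run() still reports where the data actually landed. A hedged sketch of a subclass using that hook; the class name is hypothetical:

from pathlib import Path

from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager

class JsonSuffixStager(SQLUploadStager):  # hypothetical subclass
    def write_output(self, output_path: Path, data: list[dict]) -> Path:
        final_output_path = output_path.with_suffix(".json")
        write_data(path=final_output_path, data=data)
        return final_output_path  # run() passes this back to the pipeline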
unstructured_ingest/v2/types/__init__.py ADDED
File without changes (new empty module)
unstructured_ingest/v2/types/file_data.py ADDED
@@ -0,0 +1,116 @@
+import json
+from pathlib import Path
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
+
+from unstructured_ingest.v2.logger import logger
+
+
+class SourceIdentifiers(BaseModel):
+    filename: str
+    fullpath: str
+    rel_path: Optional[str] = None
+
+    @property
+    def filename_stem(self) -> str:
+        return Path(self.filename).stem
+
+    @property
+    def relative_path(self) -> str:
+        return self.rel_path or self.fullpath
+
+
+class FileDataSourceMetadata(BaseModel):
+    url: Optional[str] = None
+    version: Optional[str] = None
+    record_locator: Optional[dict[str, Any]] = None
+    date_created: Optional[str] = None
+    date_modified: Optional[str] = None
+    date_processed: Optional[str] = None
+    permissions_data: Optional[list[dict[str, Any]]] = None
+    filesize_bytes: Optional[int] = None
+
+
+class FileData(BaseModel):
+    identifier: str
+    connector_type: str
+    source_identifiers: SourceIdentifiers
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
+    reprocess: bool = False
+    local_download_path: Optional[str] = None
+    display_name: Optional[str] = None
+
+    @classmethod
+    def from_file(cls, path: str) -> "FileData":
+        path = Path(path).resolve()
+        if not path.exists() or not path.is_file():
+            raise ValueError(f"file path not valid: {path}")
+        with open(str(path.resolve()), "rb") as f:
+            file_data_dict = json.load(f)
+        file_data = cls.model_validate(file_data_dict)
+        return file_data
+
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
+    def to_file(self, path: str) -> None:
+        path = Path(path).resolve()
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(str(path.resolve()), "w") as f:
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not detected as batch file data")
+
+    return FileData.from_file(path=path)
+
+
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+
+    return FileData.model_validate(data)
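Note: BatchFileData.populate_identifier derives the batch identifier deterministically from its items, so re-running the same batch yields the same UUID. A minimal sketch of that derivation; the item identifiers and versions are illustrative:

import json
from uuid import NAMESPACE_DNS, uuid5

items = {"item-a": "1", "item-b": None}  # identifier -> version, hypothetical
identifier_data = json.dumps(items, sort_keys=True)
print(str(uuid5(NAMESPACE_DNS, str(identifier_data))))  # stable across runs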
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.21
+Version: 0.5.25
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: click
 Requires-Dist: dataclasses_json
-Requires-Dist: pydantic>=2.7
-Requires-Dist: python-dateutil
-Requires-Dist: opentelemetry-sdk
 Requires-Dist: tqdm
+Requires-Dist: pydantic>=2.7
 Requires-Dist: numpy
 Requires-Dist: pandas
 Provides-Extra: remote
@@ -103,8 +103,8 @@ Requires-Dist: astrapy; extra == "astradb"
 Requires-Dist: numpy; extra == "astradb"
 Requires-Dist: pandas; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: numpy; extra == "azure"
 Requires-Dist: pandas; extra == "azure"
 Provides-Extra: azure-ai-search
@@ -112,13 +112,13 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Requires-Dist: numpy; extra == "azure-ai-search"
 Requires-Dist: pandas; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: numpy; extra == "biomed"
 Requires-Dist: pandas; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Requires-Dist: numpy; extra == "box"
 Requires-Dist: pandas; extra == "box"
 Provides-Extra: chroma
@@ -148,8 +148,8 @@ Requires-Dist: discord.py; extra == "discord"
 Requires-Dist: numpy; extra == "discord"
 Requires-Dist: pandas; extra == "discord"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: numpy; extra == "dropbox"
 Requires-Dist: pandas; extra == "dropbox"
 Provides-Extra: duckdb
@@ -162,13 +162,13 @@ Requires-Dist: numpy; extra == "elasticsearch"
 Requires-Dist: pandas; extra == "elasticsearch"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: numpy; extra == "gcs"
 Requires-Dist: pandas; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: numpy; extra == "github"
 Requires-Dist: pandas; extra == "github"
 Provides-Extra: gitlab
@@ -180,15 +180,15 @@ Requires-Dist: google-api-python-client; extra == "google-drive"
 Requires-Dist: numpy; extra == "google-drive"
 Requires-Dist: pandas; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: numpy; extra == "hubspot"
 Requires-Dist: pandas; extra == "hubspot"
 Provides-Extra: ibm-watsonx-s3
+Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: httpx; extra == "ibm-watsonx-s3"
-Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: numpy; extra == "ibm-watsonx-s3"
 Requires-Dist: pandas; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
@@ -217,21 +217,21 @@ Requires-Dist: numpy; extra == "mongodb"
 Requires-Dist: pandas; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: numpy; extra == "neo4j"
 Requires-Dist: pandas; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Requires-Dist: numpy; extra == "notion"
 Requires-Dist: pandas; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: numpy; extra == "onedrive"
 Requires-Dist: pandas; extra == "onedrive"
 Provides-Extra: opensearch
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
 Requires-Dist: numpy; extra == "salesforce"
 Requires-Dist: pandas; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: numpy; extra == "sftp"
 Requires-Dist: pandas; extra == "sftp"
 Provides-Extra: slack
@@ -287,8 +287,8 @@ Requires-Dist: slack_sdk[optional]; extra == "slack"
 Requires-Dist: numpy; extra == "slack"
 Requires-Dist: pandas; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: numpy; extra == "snowflake"
 Requires-Dist: pandas; extra == "snowflake"
 Provides-Extra: wikipedia
@@ -312,21 +312,21 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Requires-Dist: numpy; extra == "singlestore"
 Requires-Dist: pandas; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: httpx; extra == "vectara"
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: numpy; extra == "vectara"
 Requires-Dist: pandas; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: numpy; extra == "vastdb"
 Requires-Dist: pandas; extra == "vastdb"
 Provides-Extra: zendesk
-Requires-Dist: bs4; extra == "zendesk"
-Requires-Dist: httpx; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
+Requires-Dist: httpx; extra == "zendesk"
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: numpy; extra == "zendesk"
 Requires-Dist: pandas; extra == "zendesk"
 Provides-Extra: embed-huggingface
@@ -334,8 +334,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Requires-Dist: numpy; extra == "embed-huggingface"
 Requires-Dist: pandas; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: numpy; extra == "embed-octoai"
 Requires-Dist: pandas; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
@@ -351,13 +351,13 @@ Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Requires-Dist: numpy; extra == "embed-mixedbreadai"
 Requires-Dist: pandas; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: numpy; extra == "openai"
 Requires-Dist: pandas; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: numpy; extra == "bedrock"
 Requires-Dist: pandas; extra == "bedrock"
 Provides-Extra: togetherai
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/RECORD RENAMED
@@ -24,7 +24,7 @@ test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfv
 test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
-test/integration/connectors/test_vectara.py,sha256=
+test/integration/connectors/test_vectara.py,sha256=thM9vIWn7vcH1xjQK3owuEJMr65Z7L4j7NICsMpsMv8,9290
 test/integration/connectors/test_zendesk.py,sha256=nMBVNlEQr1uvmI1fzUC1bmoa2doXnYp5n4bMJS2FN-o,3727
 test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
@@ -113,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=A9I2h_N6BTgmKRhQ1HbPOAJuwdOFgMb_aDmK1czvHyc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -398,10 +398,10 @@ unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdj
 unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
 unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1KRxlwITNW1xl1YxMAG8BcTk0,7604
-unstructured_ingest/v2/interfaces/__init__.py,sha256=
+unstructured_ingest/v2/interfaces/__init__.py,sha256=Jn5qtWOnmBZzsb2PoQYN3Xj5xHa9thSVc0BEoIN0Pw0,1059
 unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
 unstructured_ingest/v2/interfaces/downloader.py,sha256=Qi_wISgUACZKEPu5p1kUaG3uiCXcr3zWg9z9uRDwoOk,2927
-unstructured_ingest/v2/interfaces/file_data.py,sha256=
+unstructured_ingest/v2/interfaces/file_data.py,sha256=DQYzXr8yjlm6VkGuwQLGJ1sia4Gr0d__POAFLrow1PE,525
 unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
 unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
@@ -462,21 +462,21 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6q
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
-unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=
+unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=0kEtIVQSD6RhLAqpc-0BNFQazS7lnsnWalaN3Mdn97g,6805
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
 unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=o3J81DnSwt3lmAh19jXVPAYRZLJ3VyGhaEVO2SIjksQ,2926
 unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=NIo2CCiPiuTFotNC891Mbelzg01knItryYGUtOM96xg,4393
 unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=RW-Cw94Hs3ZsN8Kb4ciSh_N-Qkp0cqkw_xkJbt8CDNU,4656
 unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
-unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=
-unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=
+unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=KmlQCA7LXppxhL9e27LBBqNT999nUcc39qe2IkZsUJ8,18988
+unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=tzOV0eNMyVHMXE5nedp6u0yyWC0Gn_blklg2ZdoOa4c,6956
 unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=kw0UfGI2fx3oQ8jVpzF45pH8Qg_QP_que5C_VXgnktc,7156
 unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=aJCtCHRBAauLwdWEQe704Cm4UHv-ukTXV2bT3SBENVk,5881
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=epf2okPKqF4R-u_zxEYDJK4g0qhFqf1ejuz8JSJaNyU,8360
 unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=0Z--cPh17W_j4jQkSe2BeeD_j0Tt147Z01gqqF58Z9A,14421
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5Bok1hGH8UZJCdtnyhZWiRwn180ohk,7177
-unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=
+unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=MtD41jZQXB-fqNzW3Whqq6ydQYDUK6Jub7sSPvgLErw,7130
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
@@ -568,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=am2d87kDkpTTB0VbPSX3ce9o6oM9KUQu5y9T_p1kgJw,5711
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=r2qgoEF3bUugzgSr3hMJyIm8DKmxsO53ZHXJSNxOsvE,9379
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=CbysCnBBHtmYkqXiaoZSazI1ombNltrsqFrY-gQzm4U,15683
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=6RoBUxMbeuhduvTFlBKMgEH1NKJg7doQjXF_R5cUuX0,5319
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=wklJ8p3eMb81FTjS6ukPoILuWN0_KQBfuYGXfE0XrqY,9644
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -581,9 +581,11 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
 unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
 unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
-unstructured_ingest
-unstructured_ingest
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
+unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
+unstructured_ingest-0.5.25.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.25.dist-info/METADATA,sha256=Z_PvUmam-C56UwoY92VhbvUd-fubXBHevjSMHKVgPx4,14999
+unstructured_ingest-0.5.25.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.25.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.25.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.25.dist-info/RECORD,,
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/LICENSE.md RENAMED
File without changes
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/WHEEL RENAMED
File without changes
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/entry_points.txt RENAMED
File without changes
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/top_level.txt RENAMED
File without changes