unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of unstructured-ingest has been flagged as a potentially problematic release.
- test/integration/connectors/sql/test_postgres.py +3 -3
- test/integration/connectors/sql/test_singlestore.py +3 -3
- test/integration/connectors/sql/test_sqlite.py +3 -3
- test/integration/connectors/test_astradb.py +40 -0
- test/integration/connectors/test_kafka.py +2 -2
- test/integration/connectors/test_mongodb.py +4 -1
- test/integration/connectors/utils/validation/source.py +31 -11
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +35 -33
- unstructured_ingest/v2/processes/connectors/couchbase.py +50 -41
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/mongodb.py +95 -100
- unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +31 -26
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +14 -13
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +44 -44
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
test/integration/connectors/sql/test_postgres.py (+3 -3)

@@ -28,7 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     PostgresUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -69,7 +69,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
     )
     indexer = PostgresIndexer(
         connection_config=connection_config,
-        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = PostgresDownloader(
         connection_config=connection_config,
@@ -81,7 +81,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
         configs=SourceValidationConfigs(
             test_id="postgres",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )

test/integration/connectors/sql/test_singlestore.py (+3 -3)

@@ -29,7 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     SingleStoreUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -66,7 +66,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
     )
     indexer = SingleStoreIndexer(
         connection_config=connection_config,
-        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SingleStoreDownloader(
         connection_config=connection_config,
@@ -80,7 +80,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
         configs=SourceValidationConfigs(
             test_id="singlestore",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )

test/integration/connectors/sql/test_sqlite.py (+3 -3)

@@ -27,7 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     SQLiteUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -57,7 +57,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
     connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
     indexer = SQLiteIndexer(
         connection_config=connection_config,
-        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SQLiteDownloader(
         connection_config=connection_config,
@@ -69,7 +69,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
         configs=SourceValidationConfigs(
             test_id="sqlite",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )

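All three SQL source tests now pin the same batching arithmetic: 10 seeded rows indexed with batch_size=6 still yield 10 downloaded files, while the number of indexed file data records drops to 2, which is consistent with one record per batch. A minimal sketch of that assumption (the values come from the hunks above; the one-record-per-batch reading is ours):

import math

SEED_DATA_ROWS = 10  # rows seeded into the "cars" table
BATCH_SIZE = 6       # batch_size passed to the indexer config

# assuming each batch of row ids produces one indexed (batch-level) file data record
expected_number_indexed_file_data = math.ceil(SEED_DATA_ROWS / BATCH_SIZE)
expected_num_files = SEED_DATA_ROWS  # one downloaded file per row
assert (expected_number_indexed_file_data, expected_num_files) == (2, 10)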
test/integration/connectors/test_astradb.py (+40 -0)

@@ -14,12 +14,18 @@ from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
     stager_validation,
 )
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (
     CONNECTOR_TYPE,
     AstraDBAccessConfig,
     AstraDBConnectionConfig,
+    AstraDBDownloader,
+    AstraDBDownloaderConfig,
     AstraDBIndexer,
     AstraDBIndexerConfig,
     AstraDBUploader,
@@ -110,6 +116,40 @@ def collection(upload_file: Path) -> Collection:
     astra_db.drop_collection(collection)
 
 
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+async def test_astra_search_source(
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    collection_name = "ingest_test_src"
+    connection_config = AstraDBConnectionConfig(
+        access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
+    )
+    indexer = AstraDBIndexer(
+        index_config=AstraDBIndexerConfig(
+            collection_name=collection_name,
+        ),
+        connection_config=connection_config,
+    )
+    downloader = AstraDBDownloader(
+        connection_config=connection_config,
+        download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
+    )
+
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=5,
+            expected_number_indexed_file_data=1,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")

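The new Astra DB source test reads its credentials through get_env_data() and is gated by requires_env. That helper is not part of this diff; a hypothetical sketch of what it presumably does (the class and function shapes here are assumptions, only the environment variable names come from the test above):

import os
from dataclasses import dataclass


@dataclass
class AstraEnvData:  # hypothetical stand-in for whatever get_env_data() actually returns
    token: str
    api_endpoint: str


def get_env_data() -> AstraEnvData:
    # requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN") guards these lookups
    return AstraEnvData(
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    )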
test/integration/connectors/test_kafka.py (+2 -2)

@@ -122,7 +122,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+            test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
         ),
     )
 
@@ -204,7 +204,7 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka",
+            test_id="kafka-cloud",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
             validate_downloaded_files=True,

test/integration/connectors/test_mongodb.py (+4 -1)

@@ -197,7 +197,10 @@ async def test_mongodb_source(temp_dir: Path):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id=CONNECTOR_TYPE,
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            expected_number_indexed_file_data=1,
         ),
     )
 

test/integration/connectors/utils/validation/source.py (+31 -11)

@@ -1,14 +1,13 @@
 import json
 import os
 import shutil
-from dataclasses import replace
 from pathlib import Path
 from typing import Callable, Optional
 
 from deepdiff import DeepDiff
 from pydantic import Field
 
-from test.integration.connectors.utils.validation.utils import ValidationConfig
+from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
@@ -92,7 +91,7 @@ def check_contents(
     file_data_path = expected_output_dir / f"{file_data.identifier}.json"
     with file_data_path.open("r") as file:
         expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.
+    current_file_data_contents = file_data.model_dump()
     expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
     current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
     diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,9 +159,11 @@ def update_fixtures(
     save_filedata: bool = True,
 ):
     # Rewrite the current file data
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
-
+        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -171,7 +172,7 @@ def update_fixtures(
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
             with file_data_path.open(mode="w") as f:
-                json.dump(file_data.
+                json.dump(file_data.model_dump(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -183,7 +184,7 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
-
+        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -213,7 +214,10 @@ def run_all_validations(
     if configs.validate_file_data:
         run_expected_results_validation(
             expected_output_dir=test_output_dir / "file_data",
-            all_file_data=
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=predownload_file_data,
+                all_postdownload_file_data=postdownload_file_data,
+            ),
             configs=configs,
         )
     download_files = get_files(dir_path=download_dir)
@@ -229,6 +233,19 @@ def run_all_validations(
     )
 
 
+def get_all_file_data(
+    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+) -> list[FileData]:
+    all_file_data = all_postdownload_file_data
+    indexed_file_data = [
+        fd
+        for fd in all_predownload_file_data
+        if fd.identifier not in [f.identifier for f in all_file_data]
+    ]
+    all_file_data += indexed_file_data
+    return all_file_data
+
+
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
@@ -246,7 +263,7 @@ async def source_connector_validation(
     test_output_dir = configs.test_output_dir()
     for file_data in indexer.run():
         assert file_data
-        predownload_file_data =
+        predownload_file_data = file_data.model_copy(deep=True)
         all_predownload_file_data.append(predownload_file_data)
         if downloader.is_async():
             resp = await downloader.run_async(file_data=file_data)
@@ -254,10 +271,10 @@ async def source_connector_validation(
             resp = downloader.run(file_data=file_data)
         if isinstance(resp, list):
             for r in resp:
-                postdownload_file_data =
+                postdownload_file_data = r["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
         else:
-            postdownload_file_data =
+            postdownload_file_data = resp["file_data"].model_copy(deep=True)
             all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         print("Running validation")
@@ -273,7 +290,10 @@ async def source_connector_validation(
     update_fixtures(
         output_dir=test_output_dir,
         download_dir=download_dir,
-        all_file_data=
+        all_file_data=get_all_file_data(
+            all_predownload_file_data=all_predownload_file_data,
+            all_postdownload_file_data=all_postdownload_file_data,
+        ),
        save_downloads=configs.validate_downloaded_files,
        save_filedata=configs.validate_file_data,
    )

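The new get_all_file_data helper merges the post-download records with any indexed (pre-download) records whose identifiers never reappeared after download, so batch-level file data also lands in the fixtures and counts toward expected_number_indexed_file_data. A small usage sketch that mirrors the helper above, using a simplified stand-in rather than the real FileData model:

from dataclasses import dataclass


@dataclass
class StubFileData:  # stand-in for unstructured_ingest's FileData in this sketch
    identifier: str


def get_all_file_data(all_postdownload_file_data, all_predownload_file_data):
    # same merge as the diff: keep post-download records, then append any
    # indexed records whose identifier was not seen after download
    all_file_data = list(all_postdownload_file_data)
    seen = {fd.identifier for fd in all_file_data}
    all_file_data += [fd for fd in all_predownload_file_data if fd.identifier not in seen]
    return all_file_data


pre = [StubFileData("batch-1")]                         # e.g. one indexed batch record
post = [StubFileData("doc-a"), StubFileData("doc-b")]   # per-document records after download
merged = get_all_file_data(post, pre)
assert [fd.identifier for fd in merged] == ["doc-a", "doc-b", "batch-1"]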
unstructured_ingest/__version__.py (+1 -1)

@@ -1 +1 @@
-__version__ = "0.3.
+__version__ = "0.3.10"  # pragma: no cover

unstructured_ingest/v2/interfaces/__init__.py (+3 -1)

@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
     "ConnectionConfig",
     "BaseConnector",
     "FileDataSourceMetadata",
+    "BatchFileData",
+    "BatchItem",
 ]

unstructured_ingest/v2/interfaces/file_data.py (+58 -14)

@@ -1,13 +1,14 @@
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any,
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
 
-from
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
 
+from unstructured_ingest.v2.logger import logger
 
-
-class SourceIdentifiers:
+
+class SourceIdentifiers(BaseModel):
     filename: str
     fullpath: str
     rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath
 
 
-
-class FileDataSourceMetadata(DataClassJsonMixin):
+class FileDataSourceMetadata(BaseModel):
     url: Optional[str] = None
     version: Optional[str] = None
     record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
     filesize_bytes: Optional[int] = None
 
 
-
-class FileData(DataClassJsonMixin):
+class FileData(BaseModel):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
-
-
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
     display_name: Optional[str] = None
@@ -52,11 +50,57 @@ class FileData(DataClassJsonMixin):
             raise ValueError(f"file path not valid: {path}")
         with open(str(path.resolve()), "rb") as f:
             file_data_dict = json.load(f)
-        file_data =
+        file_data = cls.model_validate(file_data_dict)
         return file_data
 
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
     def to_file(self, path: str) -> None:
         path = Path(path).resolve()
         path.parent.mkdir(parents=True, exist_ok=True)
         with open(str(path.resolve()), "w") as f:
-            json.dump(self.
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not valid for batch file data")
+
+    return FileData.from_file(path=path)

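The behavioral core of the new models, as shown in the hunks above: FileData is now a pydantic BaseModel, BatchFileData derives its identifier deterministically from the batch item identifiers and versions via uuid5, and the field validator sorts and de-duplicates batch items. A minimal sketch of that behavior under those assumptions (field values are illustrative):

from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem

batch = BatchFileData(
    connector_type="postgres",
    batch_items=[BatchItem(identifier="3"), BatchItem(identifier="1"), BatchItem(identifier="2")],
)
same_batch = BatchFileData(
    connector_type="postgres",
    batch_items=[BatchItem(identifier=i) for i in ("1", "2", "3")],
)

# the identifier is populated by the model validator and should be stable
# for the same set of (identifier, version) pairs, regardless of input order
assert batch.identifier == same_batch.identifier
# the field validator sorts batch items by identifier
assert [item.identifier for item in batch.batch_items] == ["1", "2", "3"]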
unstructured_ingest/v2/pipeline/steps/chunk.py (+2 -1)

@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.chunker import Chunker
@@ -51,7 +52,7 @@ class ChunkStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str, **kwargs
     ) -> ChunkStepResponse:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping chunking, output already exists: {output_filepath}")

unstructured_ingest/v2/pipeline/steps/download.py (+5 -4)

@@ -8,6 +8,7 @@ from typing import Callable, Optional, TypedDict, TypeVar
 
 from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.utils import serialize_base_model_json
@@ -87,12 +88,12 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"updating file data with new content: {file_data.
+            logger.debug(f"updating file data with new content: {file_data.model_dump()}")
             with file_data_path.open("w") as file:
-                json.dump(file_data.
+                json.dump(file_data.model_dump(), file, indent=2)
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"skipping download, file already exists locally: {download_path}")
@@ -172,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-            json.dump(file_data.
+            json.dump(file_data.model_dump(), f, indent=2)
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:

unstructured_ingest/v2/pipeline/steps/embed.py (+2 -1)

@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.embedder import Embedder
@@ -49,7 +50,7 @@ class EmbedStep(PipelineStep):
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping embedding, output already exists: {output_filepath}")

unstructured_ingest/v2/pipeline/steps/filter.py (+2 -2)

@@ -2,7 +2,7 @@ import asyncio
 from dataclasses import dataclass
 from typing import Callable, Optional
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.filter import Filterer
@@ -20,7 +20,7 @@ class FilterStep(PipelineStep):
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
             resp = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/index.py (+4 -4)

@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.
+                    json.dump(file_data.model_dump(), f, indent=2)
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):
 
     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.
+                    json.dump(file_data.model_dump(), f, indent=2)
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)

unstructured_ingest/v2/pipeline/steps/partition.py (+3 -2)

@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.partitioner import Partitioner
@@ -51,12 +52,12 @@ class PartitionStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str
     ) -> Optional[PartitionStepResponse]:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata.
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.model_dump()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:

unstructured_ingest/v2/pipeline/steps/stage.py (+2 -2)

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
@@ -43,7 +43,7 @@ class UploadStageStep(PipelineStep):
         output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
         fn_kwargs = {
             "elements_filepath": path,
-            "file_data":
+            "file_data": file_data_from_file(path=file_data_path),
             "output_dir": self.cache_dir,
             "output_filename": output_filename,
         }

unstructured_ingest/v2/pipeline/steps/uncompress.py (+2 -2)

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, TypedDict
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
@@ -28,7 +28,7 @@ class UncompressStep(PipelineStep):
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
     ) -> list[UncompressStepResponse]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
             new_file_data = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/upload.py (+3 -3)

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.interfaces.uploader import UploadContent
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
@@ -41,14 +41,14 @@ class UploadStep(BatchPipelineStep):
     @instrument(span_name=STEP_ID)
     def _run_batch(self, contents: list[UploadStepContent]) -> None:
         upload_contents = [
-            UploadContent(path=Path(c["path"]), file_data=
+            UploadContent(path=Path(c["path"]), file_data=file_data_from_file(c["file_data_path"]))
             for c in contents
         ]
         self.process.run_batch(contents=upload_contents)
 
     async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
         fn = fn or self.process.run_async
-        fn_kwargs = {"path": Path(path), "file_data":
+        fn_kwargs = {"path": Path(path), "file_data": file_data_from_file(path=file_data_path)}
         if not asyncio.iscoroutinefunction(fn):
             fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:

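Every pipeline step now rehydrates its serialized file data through file_data_from_file, which, per the file_data.py hunks, returns a BatchFileData when the JSON contains batch_items and falls back to plain FileData otherwise. A minimal round-trip sketch under that assumption (paths and field values are illustrative):

import tempfile
from pathlib import Path

from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem, FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file

with tempfile.TemporaryDirectory() as tmp:
    plain_path = str(Path(tmp) / "plain.json")
    batch_path = str(Path(tmp) / "batch.json")

    # serialize one plain record and one batch record the way the steps do
    FileData(identifier="doc-a", connector_type="local").to_file(plain_path)
    BatchFileData(
        connector_type="postgres",
        batch_items=[BatchItem(identifier="1"), BatchItem(identifier="2")],
    ).to_file(batch_path)

    # rehydration picks the right model based on the stored content
    assert isinstance(file_data_from_file(path=batch_path), BatchFileData)
    assert isinstance(file_data_from_file(path=plain_path), FileData)
    assert not isinstance(file_data_from_file(path=plain_path), BatchFileData)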
unstructured_ingest/v2/processes/connectors/__init__.py (+3 -0)

@@ -40,6 +40,8 @@ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
 from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
+from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
+from .neo4j import neo4j_destination_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -74,6 +76,7 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
 add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
+add_destination_entry(destination_type=NEO4J_CONNECTOR_TYPE, entry=neo4j_destination_entry)
 
 add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
 