unstructured-ingest 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +6 -1
- unstructured_ingest/v2/interfaces/uploader.py +9 -4
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/interfaces.py +61 -28
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +11 -7
- unstructured_ingest/v2/pipeline/steps/index.py +2 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +7 -19
- unstructured_ingest/v2/processes/chunker.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +3 -8
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +4 -9
- unstructured_ingest/v2/processes/connectors/chroma.py +3 -8
- unstructured_ingest/v2/processes/connectors/couchbase.py +5 -9
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -10
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +4 -7
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -6
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +2 -3
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +3 -3
- unstructured_ingest/v2/processes/connectors/kdbai.py +7 -8
- unstructured_ingest/v2/processes/connectors/local.py +15 -22
- unstructured_ingest/v2/processes/connectors/milvus.py +2 -14
- unstructured_ingest/v2/processes/connectors/mongodb.py +3 -8
- unstructured_ingest/v2/processes/connectors/pinecone.py +6 -24
- unstructured_ingest/v2/processes/connectors/singlestore.py +6 -6
- unstructured_ingest/v2/processes/connectors/sql.py +5 -7
- unstructured_ingest/v2/processes/connectors/weaviate.py +4 -11
- unstructured_ingest/v2/processes/partitioner.py +13 -3
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/METADATA +275 -211
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/RECORD +38 -37
- unstructured_ingest/v2/example.py +0 -37
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/fsspec/azure.py:

@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -152,8 +152,8 @@ class AzureUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/box.py:

@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -118,8 +118,8 @@ class BoxUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["boxfs"], extras="box")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["boxfs"], extras="box")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py:

@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -114,8 +114,8 @@ class DropboxUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py:

@@ -26,7 +26,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    UploadContent,
     Uploader,
     UploaderConfig,
 )
@@ -273,6 +272,9 @@ class FsspecUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: FsspecUploaderConfigT = field(default=None)

+    def is_async(self) -> bool:
+        return self.fs.async_impl
+
     @property
     def fs(self) -> "AbstractFileSystem":
         from fsspec import get_filesystem_class
@@ -311,11 +313,7 @@ class FsspecUploader(Uploader):
         updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
         return updated_upload_path

-    def run(self,
-        for content in contents:
-            self._run(path=content.path, file_data=content.file_data)
-
-    def _run(self, path: Path, file_data: FileData) -> None:
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:

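Across the fsspec-based connectors, `UploadContent` drops out of the uploader interface: the batched `run(contents)` / `_run(path, file_data)` pair collapses into a single `run(path, file_data, **kwargs)` that handles one staged file per call, and `is_async()` now defers to the filesystem's `async_impl`. Below is a minimal sketch of a destination written against that per-file contract; the `LoggingUploader` class and the `FileDataLike` protocol are hypothetical stand-ins, only the method shape comes from the diff:

```python
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Protocol


class FileDataLike(Protocol):
    """Stand-in for unstructured_ingest.v2.interfaces.FileData (only the field used here)."""

    identifier: str


@dataclass
class LoggingUploader:
    """Hypothetical destination illustrating the new per-file uploader contract."""

    def is_async(self) -> bool:
        # The real fsspec uploaders return self.fs.async_impl here.
        return False

    def run(self, path: Path, file_data: FileDataLike, **kwargs: Any) -> None:
        # One staged file per call; no more list[UploadContent] batching.
        print(f"would upload {path} for record {file_data.identifier}")
```

With each connector handling a single file per call, batching across staged files appears to move up into the pipeline, which invokes the uploader once per file.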
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py:

@@ -8,7 +8,7 @@ from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -151,8 +151,8 @@ class GcsUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/s3.py:

@@ -12,7 +12,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloadResponse,
     FileData,
     FileDataSourceMetadata,
-    UploadContent,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -171,8 +170,8 @@ class S3Uploader(FsspecUploader):
         super().__post_init__()

     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py:

@@ -9,7 +9,7 @@ from urllib.parse import urlparse
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -142,8 +142,8 @@ class SftpUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/kdbai.py:

@@ -15,7 +15,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -152,13 +151,13 @@ class KdbaiUploader(Uploader):
         df = pd.DataFrame(data=all_records)
         self.process_dataframe(df=df)

-    def run(self,
-
-
-
-
-
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if path.suffix == ".csv":
+            self.process_csv(csv_paths=[path])
+        elif path.suffix == ".json":
+            self.process_json(json_paths=[path])
+        else:
+            raise ValueError(f"Unsupported file type, must be json or csv file: {path}")


 kdbai_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/local.py:

@@ -18,7 +18,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    UploadContent,
     Uploader,
     UploaderConfig,
 )
@@ -179,27 +178,21 @@ class LocalUploader(Uploader):
     def is_async(self) -> bool:
         return False

-    def run(self,
-
-
-
-        identifiers
-
-
-
-
-
-
-
-
-
-        final_path = self.upload_config.output_path / Path(
-            f"{content.file_data.identifier}.json"
-        )
-        Path(final_path).parent.mkdir(parents=True, exist_ok=True)
-        logger.debug(f"copying file from {content.path} to {final_path}")
-        shutil.copy(src=str(content.path), dst=str(final_path))
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if source_identifiers := file_data.source_identifiers:
+            identifiers = source_identifiers
+            rel_path = (
+                identifiers.relative_path[1:]
+                if identifiers.relative_path.startswith("/")
+                else identifiers.relative_path
+            )
+            new_path = self.upload_config.output_path / Path(rel_path)
+            final_path = str(new_path).replace(identifiers.filename, f"{identifiers.filename}.json")
+        else:
+            final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
+        Path(final_path).parent.mkdir(parents=True, exist_ok=True)
+        logger.debug(f"copying file from {path} to {final_path}")
+        shutil.copy(src=str(path), dst=str(final_path))


 local_source_entry = SourceRegistryEntry(

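The local destination now derives its output location from `file_data.source_identifiers` when present (mirroring the source directory layout and appending `.json` to the original filename), and only falls back to `<identifier>.json` otherwise. A standalone sketch of that path mapping; the helper name and the example values are made up, the logic mirrors the added lines above:

```python
from pathlib import Path
from typing import Optional


def resolve_output_path(output_dir: Path, relative_path: Optional[str],
                        filename: Optional[str], identifier: str) -> str:
    """Hypothetical helper mirroring the LocalUploader.run path logic shown above."""
    if relative_path and filename:
        rel = relative_path[1:] if relative_path.startswith("/") else relative_path
        new_path = output_dir / Path(rel)
        return str(new_path).replace(filename, f"{filename}.json")
    return str(output_dir / f"{identifier}.json")


# A file indexed as /docs/report.pdf lands at structured-output/docs/report.pdf.json;
# without source identifiers it falls back to structured-output/abc123.json.
print(resolve_output_path(Path("structured-output"), "/docs/report.pdf", "report.pdf", "abc123"))
print(resolve_output_path(Path("structured-output"), None, None, "abc123"))
```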
unstructured_ingest/v2/processes/connectors/milvus.py:

@@ -1,5 +1,4 @@
 import json
-import multiprocessing as mp
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -135,9 +134,6 @@ class MilvusUploadStager(UploadStager):

 class MilvusUploaderConfig(UploaderConfig):
     collection_name: str = Field(description="Milvus collections to write to")
-    num_processes: int = Field(
-        default=4, description="number of processes to use when writing to support parallel writes"
-    )


 @dataclass
@@ -183,16 +179,8 @@ class MilvusUploader(Uploader):
             data: list[dict] = json.load(file)
         self.insert_results(data=data)

-    def run(self,
-
-        for content in contents:
-            self.upload(content=content)
-
-        else:
-            with mp.Pool(
-                processes=self.upload_config.num_processes,
-            ) as pool:
-                pool.map(self.upload, contents)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload(content=UploadContent(path=path, file_data=file_data))


 milvus_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/mongodb.py:

@@ -13,7 +13,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -119,13 +118,9 @@ class MongoDBUploader(Uploader):
             server_api=ServerApi(version=SERVER_API_VERSION),
         )

-    def run(self,
-
-
-        with open(content.path) as elements_file:
-            elements = json.load(elements_file)
-            elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
             f"db, {self.connection_config.database}, "

unstructured_ingest/v2/processes/connectors/pinecone.py:

@@ -1,5 +1,4 @@
 import json
-import multiprocessing as mp
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -13,7 +12,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
-
+    FileData,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -68,7 +67,6 @@ class PineconeUploadStagerConfig(UploadStagerConfig):

 class PineconeUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
-    num_processes: int = Field(default=4, description="Number of processes to use for uploading")


 @dataclass
@@ -143,34 +141,18 @@ class PineconeUploader(Uploader):
             raise DestinationConnectionError(f"http error: {api_error}") from api_error
         logger.debug(f"results: {response}")

-    def run(self,
-
-
-        for content in contents:
-            with open(content.path) as elements_file:
-                elements = json.load(elements_file)
-                elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing document batches to destination"
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
-            f" with {self.upload_config.num_processes} (number of) processes"
         )

         pinecone_batch_size = self.upload_config.batch_size
-
-
-        for batch in batch_generator(elements_dict, pinecone_batch_size):
-            self.upsert_batch(batch)  # noqa: E203
-
-        else:
-            with mp.Pool(
-                processes=self.upload_config.num_processes,
-            ) as pool:
-                pool.map(
-                    self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size))
-                )
+        for pinecone_batch in batch_generator(elements_dict, pinecone_batch_size):
+            self.upsert_batch(batch=pinecone_batch)


 pinecone_destination_entry = DestinationRegistryEntry(

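In both the Milvus and Pinecone destinations the `multiprocessing` pool and the `num_processes` config option are gone; each `run` call now reads one staged JSON file and upserts its batches sequentially via `batch_generator`. A rough sketch of that batching idea; `chunked` below is a simple stand-in, not the library's `batch_generator`:

```python
from typing import Any, Iterator


def chunked(records: list[dict[str, Any]], batch_size: int) -> Iterator[list[dict[str, Any]]]:
    """Yield successive fixed-size batches (stand-in for batch_generator)."""
    for start in range(0, len(records), batch_size):
        yield records[start : start + batch_size]


def upsert_all(records: list[dict[str, Any]], batch_size: int = 100) -> None:
    # Mirrors the sequential loop in PineconeUploader.run: one batch at a time, no process pool.
    for batch in chunked(records, batch_size):
        print(f"would upsert a batch of {len(batch)} records")


upsert_all([{"id": str(i)} for i in range(250)])
```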
unstructured_ingest/v2/processes/connectors/singlestore.py:

@@ -16,7 +16,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -120,8 +119,8 @@ class SingleStoreUploader(Uploader):
     upload_config: SingleStoreUploaderConfig
     connector_type: str = CONNECTOR_TYPE

-    def upload_csv(self,
-        df = pd.read_csv(
+    def upload_csv(self, csv_path: Path) -> None:
+        df = pd.read_csv(csv_path)
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.database} "
             f"db in table {self.upload_config.table_name}"
@@ -142,9 +141,10 @@ class SingleStoreUploader(Uploader):
                 cur.executemany(stmt, chunk)
                 conn.commit()

-    def run(self,
-
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if path.suffix != ".csv":
+            raise ValueError(f"Only .csv files are supported: {path}")
+        self.upload_csv(csv_path=path)


 singlestore_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/sql.py:

@@ -16,7 +16,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -246,8 +245,8 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def upload_contents(self,
-        df = pd.read_json(
+    def upload_contents(self, path: Path) -> None:
+        df = pd.read_json(path, orient="records", lines=True)
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
         df.replace({np.nan: None}, inplace=True)

@@ -256,7 +255,7 @@ class SQLUploader(Uploader):
             VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})"  # noqa E501

         for rows in pd.read_json(
-
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
         ):
             with self.connection() as conn:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
@@ -268,9 +267,8 @@ class SQLUploader(Uploader):

             conn.commit()

-    def run(self,
-
-        self.upload_contents(content=content)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload_contents(path=path)


 sql_destination_entry = DestinationRegistryEntry(

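The SQL uploader now takes the staged file path directly and streams it with pandas: `pd.read_json(..., orient="records", lines=True, chunksize=batch_size)` returns an iterator of DataFrame chunks, so rows can be inserted batch by batch instead of holding the whole file in memory. A small self-contained illustration of that chunked-read pattern (the file name and batch size here are made up):

```python
import pandas as pd

# Create a tiny JSON Lines file so the example runs on its own.
with open("elements.ndjson", "w") as f:
    f.write('{"text": "hello", "page_number": 1}\n')
    f.write('{"text": "world", "page_number": 2}\n')
    f.write('{"text": "again", "page_number": 3}\n')

# With chunksize set, read_json yields DataFrames of at most `chunksize` rows each.
for rows in pd.read_json("elements.ndjson", orient="records", lines=True, chunksize=2):
    batch = tuple(rows.itertuples(index=False, name=None))
    print(f"would executemany() over {len(batch)} row(s)")
```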
unstructured_ingest/v2/processes/connectors/weaviate.py:

@@ -13,7 +13,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -184,7 +183,7 @@ class WeaviateUploader(Uploader):

     @requires_dependencies(["weaviate"], extras="weaviate")
     def _resolve_auth_method(self):
-        access_configs = self.connection_config.access_config
+        access_configs = self.connection_config.access_config.get_secret_value()
         connection_config = self.connection_config
         if connection_config.anonymous:
             return None
@@ -216,15 +215,9 @@ class WeaviateUploader(Uploader):
         )
         return None

-    def run(self,
-
-
-        elements_dict = []
-        for content in contents:
-            with open(content.path) as elements_file:
-                elements = json.load(elements_file)
-                elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
             f"class {self.connection_config.class_name} "

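The one-line weaviate fix reflects access configs now being wrapped in pydantic's `Secret`, so the nested values have to be unwrapped with `get_secret_value()` before use. A tiny illustration of that wrapper with made-up config classes (not the connector's real models):

```python
from pydantic import BaseModel, Secret


class ExampleAccessConfig(BaseModel):
    api_key: str


class ExampleConnectionConfig(BaseModel):
    # Secret[...] keeps the nested values masked in repr/serialization until unwrapped.
    access_config: Secret[ExampleAccessConfig]


conn = ExampleConnectionConfig(access_config=ExampleAccessConfig(api_key="not-a-real-key"))
print(conn.access_config)                             # masked
print(conn.access_config.get_secret_value().api_key)  # "not-a-real-key"
```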
unstructured_ingest/v2/processes/partitioner.py:

@@ -13,6 +13,7 @@ from unstructured_ingest.v2.logger import logger

 if TYPE_CHECKING:
     from unstructured_client import UnstructuredClient
+    from unstructured_client.models.operations import PartitionRequest
     from unstructured_client.models.shared import PartitionParameters


@@ -153,7 +154,7 @@ class Partitioner(BaseProcess, ABC):
         )
         return self.postprocess(elements=elements_to_dicts(elements))

-    async def call_api(self, client: "UnstructuredClient", request: "
+    async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
         # TODO when client supports async, run without using run_in_executor
         # isolate the IO heavy call
         loop = asyncio.get_event_loop()
@@ -163,7 +164,14 @@ class Partitioner(BaseProcess, ABC):
         from unstructured_client.models.shared import Files, PartitionParameters

         partition_request = self.config.to_partition_kwargs()
-
+
+        # Note(austin): PartitionParameters is a Pydantic model in v0.26.0
+        # Prior to this it was a dataclass which doesn't have .__fields
+        try:
+            possible_fields = PartitionParameters.__fields__
+        except AttributeError:
+            possible_fields = [f.name for f in fields(PartitionParameters)]
+
         filtered_partition_request = {
             k: v for k, v in partition_request.items() if k in possible_fields
         }
@@ -189,6 +197,7 @@ class Partitioner(BaseProcess, ABC):
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
         from unstructured_client import UnstructuredClient
+        from unstructured_client.models.operations import PartitionRequest

         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
         client = UnstructuredClient(
@@ -196,7 +205,8 @@ class Partitioner(BaseProcess, ABC):
             api_key_auth=self.config.api_key.get_secret_value(),
         )
         partition_params = self.create_partition_parameters(filename=filename)
-
+        partition_request = PartitionRequest(partition_params)
+        resp = await self.call_api(client=client, request=partition_request)
         elements = resp.elements or []
         # Append the data source metadata the auto partition does for you
         for element in elements:

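The partitioner now builds a `PartitionRequest` around the partition parameters before calling the client, and guards against two `unstructured-client` generations: from v0.26.0 `PartitionParameters` is a Pydantic model (so it exposes `__fields__`), while earlier releases shipped it as a dataclass (so field names come from `dataclasses.fields`). A standalone sketch of that feature-detection pattern, using a stand-in class rather than the real client model:

```python
from dataclasses import dataclass, fields


@dataclass
class FakePartitionParameters:
    """Stand-in for unstructured_client's PartitionParameters (the dataclass generation)."""

    files: object = None
    strategy: str = "auto"


def allowed_fields(params_cls) -> set:
    # Pydantic models expose __fields__; plain dataclasses need dataclasses.fields().
    try:
        return set(params_cls.__fields__)
    except AttributeError:
        return {f.name for f in fields(params_cls)}


requested = {"strategy": "hi_res", "not_a_real_kwarg": True}
filtered = {k: v for k, v in requested.items() if k in allowed_fields(FakePartitionParameters)}
print(filtered)  # {'strategy': 'hi_res'}: unknown keys are dropped before building the request
```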