unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of unstructured-ingest has been flagged as a potentially problematic release.

Files changed (93)
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +156 -0
  10. test/integration/connectors/test_azure_cog_search.py +233 -0
  11. test/integration/connectors/test_delta_table.py +46 -0
  12. test/integration/connectors/test_kafka.py +150 -16
  13. test/integration/connectors/test_lancedb.py +209 -0
  14. test/integration/connectors/test_milvus.py +141 -0
  15. test/integration/connectors/test_pinecone.py +213 -0
  16. test/integration/connectors/test_s3.py +23 -0
  17. test/integration/connectors/utils/docker.py +81 -15
  18. test/integration/connectors/utils/validation.py +10 -0
  19. test/integration/connectors/weaviate/__init__.py +0 -0
  20. test/integration/connectors/weaviate/conftest.py +15 -0
  21. test/integration/connectors/weaviate/test_local.py +131 -0
  22. test/unit/v2/__init__.py +0 -0
  23. test/unit/v2/chunkers/__init__.py +0 -0
  24. test/unit/v2/chunkers/test_chunkers.py +49 -0
  25. test/unit/v2/connectors/__init__.py +0 -0
  26. test/unit/v2/embedders/__init__.py +0 -0
  27. test/unit/v2/embedders/test_bedrock.py +36 -0
  28. test/unit/v2/embedders/test_huggingface.py +48 -0
  29. test/unit/v2/embedders/test_mixedbread.py +37 -0
  30. test/unit/v2/embedders/test_octoai.py +35 -0
  31. test/unit/v2/embedders/test_openai.py +35 -0
  32. test/unit/v2/embedders/test_togetherai.py +37 -0
  33. test/unit/v2/embedders/test_vertexai.py +37 -0
  34. test/unit/v2/embedders/test_voyageai.py +38 -0
  35. test/unit/v2/partitioners/__init__.py +0 -0
  36. test/unit/v2/partitioners/test_partitioner.py +63 -0
  37. test/unit/v2/utils/__init__.py +0 -0
  38. test/unit/v2/utils/data_generator.py +32 -0
  39. unstructured_ingest/__version__.py +1 -1
  40. unstructured_ingest/cli/cmds/__init__.py +2 -2
  41. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  42. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  43. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  44. unstructured_ingest/runner/writers/__init__.py +2 -2
  45. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  46. unstructured_ingest/utils/data_prep.py +9 -1
  47. unstructured_ingest/v2/constants.py +2 -0
  48. unstructured_ingest/v2/processes/connectors/__init__.py +7 -20
  49. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  50. unstructured_ingest/v2/processes/connectors/astradb.py +35 -23
  51. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +116 -35
  52. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  53. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  54. unstructured_ingest/v2/processes/connectors/delta_table.py +37 -9
  55. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  56. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +93 -46
  57. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  58. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
  59. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  60. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
  61. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
  62. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +84 -23
  63. unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
  64. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  65. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  66. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  67. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  69. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  70. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
  72. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +101 -13
  74. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  75. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  77. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  78. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  79. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  80. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  81. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  82. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
  83. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +20 -19
  84. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +91 -50
  85. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  86. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  87. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  88. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  89. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  90. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
  91. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
  92. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
  93. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -23,11 +24,13 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
 
 if TYPE_CHECKING:
     from pinecone import Index as PineconeIndex
+    from pinecone import Pinecone
 
 
 CONNECTOR_TYPE = "pinecone"
 MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
 MAX_POOL_THREADS = 100
+MAX_METADATA_BYTES = 40960  # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
 
 
 class PineconeAccessConfig(AccessConfig):
@@ -43,16 +46,19 @@ class PineconeConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["pinecone"], extras="pinecone")
-    def get_index(self, **index_kwargs) -> "PineconeIndex":
+    def get_client(self, **index_kwargs) -> "Pinecone":
         from pinecone import Pinecone
 
         from unstructured_ingest import __version__ as unstructured_version
 
-        pc = Pinecone(
+        return Pinecone(
             api_key=self.access_config.get_secret_value().pinecone_api_key,
             source_tag=f"unstructured_ingest=={unstructured_version}",
         )
 
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
+        pc = self.get_client()
+
         index = pc.Index(name=self.index_name, **index_kwargs)
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index
@@ -98,6 +104,10 @@ class PineconeUploaderConfig(UploaderConfig):
         default=None,
         description="The namespace to write to. If not specified, the default namespace is used",
     )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -106,7 +116,7 @@ class PineconeUploadStager(UploadStager):
         default_factory=lambda: PineconeUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         embeddings = element_dict.pop("embeddings", None)
         metadata: dict[str, Any] = element_dict.pop("metadata", {})
         data_source = metadata.pop("data_source", {})
@@ -121,19 +131,30 @@ class PineconeUploadStager(UploadStager):
             }
         )
 
+        metadata = flatten_dict(
+            pinecone_metadata,
+            separator="-",
+            flatten_lists=True,
+            remove_none=True,
+        )
+        metadata[RECORD_ID_LABEL] = file_data.identifier
+        metadata_size_bytes = len(json.dumps(metadata).encode())
+        if metadata_size_bytes > MAX_METADATA_BYTES:
+            logger.info(
+                f"Metadata size is {metadata_size_bytes} bytes, which exceeds the limit of"
+                f" {MAX_METADATA_BYTES} bytes per vector. Dropping the metadata."
+            )
+            metadata = {}
+
         return {
             "id": str(uuid.uuid4()),
             "values": embeddings,
-            "metadata": flatten_dict(
-                pinecone_metadata,
-                separator="-",
-                flatten_lists=True,
-                remove_none=True,
-            ),
+            "metadata": metadata,
         }
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -143,10 +164,15 @@
             elements_contents = json.load(elements_file)
 
         conformed_elements = [
-            self.conform_dict(element_dict=element) for element in elements_contents
+            self.conform_dict(element_dict=element, file_data=file_data)
+            for element in elements_contents
         ]
 
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
+        output_path = Path(output_dir) / Path(f"{output_filename}")
         output_path.parent.mkdir(parents=True, exist_ok=True)
 
         with open(output_path, "w") as output_file:
@@ -167,6 +193,61 @@ class PineconeUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    def pod_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from pinecone pod index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        delete_kwargs = {
+            "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        }
+        if namespace := self.upload_config.namespace:
+            delete_kwargs["namespace"] = namespace
+
+        resp = index.delete(**delete_kwargs)
+        logger.debug(
+            f"deleted any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from pinecone index: {resp}"
+        )
+
+    def serverless_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from pinecone serverless index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        index_stats = index.describe_index_stats()
+        total_vectors = index_stats["total_vector_count"]
+        if total_vectors == 0:
+            return
+        dimension = index_stats["dimension"]
+        query_params = {
+            "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
+            "vector": [0] * dimension,
+            "top_k": total_vectors,
+        }
+        if namespace := self.upload_config.namespace:
+            query_params["namespace"] = namespace
+        while True:
+            query_results = index.query(**query_params)
+            matches = query_results.get("matches", [])
+            if not matches:
+                break
+            ids = [match["id"] for match in matches]
+            delete_params = {"ids": ids}
+            if namespace := self.upload_config.namespace:
+                delete_params["namespace"] = namespace
+            index.delete(**delete_params)
+        logger.debug(
+            f"deleted any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from pinecone index"
+        )
+
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException
@@ -206,9 +287,16 @@
             f"writing a total of {len(elements_dict)} elements via"
             f" document batches to destination"
             f" index named {self.connection_config.index_name}"
-            f" with batch size {self.upload_config.batch_size}"
         )
-
+        # Determine if serverless or pod based index
+        pinecone_client = self.connection_config.get_client()
+        index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
+        if "serverless" in index_description.get("spec"):
+            self.serverless_delete_by_record_id(file_data=file_data)
+        elif "pod" in index_description.get("spec"):
+            self.pod_delete_by_record_id(file_data=file_data)
+        else:
+            raise ValueError(f"unexpected spec type in index description: {index_description}")
         self.upsert_batches_async(elements_dict=elements_dict)
 
 
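Note: the uploader's run path now checks the index spec and removes any vectors previously written for the same record before upserting, while the stager caps per-vector metadata at Pinecone's 40 KB hard limit. A minimal sketch of that size guard, independent of the connector (the record_id key and sample values are purely illustrative):

import json

MAX_METADATA_BYTES = 40960  # Pinecone's hard limit on metadata per vector (40 KB)

def cap_metadata(metadata: dict) -> dict:
    # Keep the metadata only if its JSON serialization fits the per-vector limit.
    size_bytes = len(json.dumps(metadata).encode())
    return metadata if size_bytes <= MAX_METADATA_BYTES else {}

# Example staged vector; "record_id" mirrors the label the uploader later filters on.
vector = {
    "id": "illustrative-uuid",
    "values": [0.1, 0.2, 0.3],
    "metadata": cap_metadata({"text": "hello world", "record_id": "my-file-id"}),
}
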
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -21,7 +21,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -426,7 +425,7 @@ class SharepointDownloader(Downloader):
             f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
         if not content_type:
             raise ValueError(
@@ -436,6 +435,8 @@
             return self.get_document(file_data=file_data)
         elif content_type == SharepointContentType.SITEPAGE.value:
             return self.get_site_page(file_data=file_data)
+        else:
+            raise ValueError(f"content type not recognized: {content_type}")
 
 
 sharepoint_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/slack.py

@@ -16,9 +16,9 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     Indexer,
     IndexerConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.interfaces.file_data import (
     FileData,
@@ -161,7 +161,7 @@ class SlackDownloader(Downloader):
     def run(self, file_data, **kwargs):
         raise NotImplementedError
 
-    async def run_async(self, file_data: FileData, **kwargs) -> download_responses:
+    async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
         # NOTE: Indexer should provide source identifiers required to generate the download path
         download_path = self.get_download_path(file_data)
         if download_path is None:
unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -98,20 +98,28 @@ class PostgresDownloader(SQLDownloader):
     download_config: PostgresDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["psycopg2"], extras="postgres")
     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        from psycopg2 import sql
+
         table_name = file_data.additional_metadata["table_name"]
         id_column = file_data.additional_metadata["id_column"]
-        ids = file_data.additional_metadata["ids"]
+        ids = tuple(file_data.additional_metadata["ids"])
+
         with self.connection_config.get_cursor() as cursor:
-            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
-            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+            fields = (
+                sql.SQL(",").join(sql.Identifier(field) for field in self.download_config.fields)
+                if self.download_config.fields
+                else sql.SQL("*")
+            )
+
+            query = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s").format(
                 fields=fields,
-                table_name=table_name,
-                id_column=id_column,
-                ids=",".join([str(i) for i in ids]),
+                table_name=sql.Identifier(table_name),
+                id_column=sql.Identifier(id_column),
             )
-            logger.debug(f"running query: {query}")
-            cursor.execute(query)
+            logger.debug(f"running query: {cursor.mogrify(query, (ids,))}")
+            cursor.execute(query, (ids,))
             rows = cursor.fetchall()
             columns = [col[0] for col in cursor.description]
             return rows, columns
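Note: the rewritten query_db composes identifiers with psycopg2's sql module and binds the id tuple as a parameter rather than formatting values into the SQL string. A standalone sketch of the same pattern, assuming a reachable database; the DSN, table, and column names are placeholders:

import psycopg2
from psycopg2 import sql

conn = psycopg2.connect("dbname=example user=example")  # placeholder DSN
ids = (1, 2, 3)

# Identifiers are quoted via sql.Identifier; values travel as bound parameters.
query = sql.SQL("SELECT {fields} FROM {table} WHERE {id_col} IN %s").format(
    fields=sql.SQL(",").join([sql.Identifier("id"), sql.Identifier("text")]),
    table=sql.Identifier("elements"),
    id_col=sql.Identifier("id"),
)
with conn.cursor() as cursor:
    cursor.execute(query, (ids,))
    rows = cursor.fetchall()
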
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -16,6 +16,8 @@ from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+from unstructured_ingest.utils.data_prep import split_dataframe
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -236,35 +238,25 @@ class SQLUploadStagerConfig(UploadStagerConfig):
 class SQLUploadStager(UploadStager):
     upload_stager_config: SQLUploadStagerConfig = field(default_factory=SQLUploadStagerConfig)
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict] = json.load(elements_file)
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
+    @staticmethod
+    def conform_dict(data: dict, file_data: FileData) -> pd.DataFrame:
+        working_data = data.copy()
         output = []
-        for data in elements_contents:
-            metadata: dict[str, Any] = data.pop("metadata", {})
+        for element in working_data:
+            metadata: dict[str, Any] = element.pop("metadata", {})
             data_source = metadata.pop("data_source", {})
             coordinates = metadata.pop("coordinates", {})
 
-            data.update(metadata)
-            data.update(data_source)
-            data.update(coordinates)
+            element.update(metadata)
+            element.update(data_source)
+            element.update(coordinates)
 
-            data["id"] = str(uuid.uuid4())
+            element["id"] = str(uuid.uuid4())
 
             # remove extraneous, not supported columns
-            data = {k: v for k, v in data.items() if k in _COLUMNS}
-
-            output.append(data)
+            element = {k: v for k, v in element.items() if k in _COLUMNS}
+            element[RECORD_ID_LABEL] = file_data.identifier
+            output.append(element)
 
         df = pd.DataFrame.from_dict(output)
         for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
@@ -281,6 +273,26 @@
             ("version", "page_number", "regex_metadata"),
         ):
             df[column] = df[column].apply(str)
+        return df
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents: list[dict] = json.load(elements_file)
+
+        df = self.conform_dict(data=elements_contents, file_data=file_data)
+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
+        output_path = Path(output_dir) / Path(f"{output_filename}")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
 
         with output_path.open("w") as output_file:
             df.to_json(output_file, orient="records", lines=True)
@@ -290,6 +302,10 @@
 class SQLUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=50, description="Number of records per batch")
     table_name: str = Field(default="elements", description="which table to upload contents to")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -323,18 +339,45 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output
 
+    def _fit_to_schema(self, df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
+        columns = set(df.columns)
+        schema_fields = set(columns)
+        columns_to_drop = columns - schema_fields
+        missing_columns = schema_fields - columns
+
+        if columns_to_drop:
+            logger.warning(
+                "Following columns will be dropped to match the table's schema: "
+                f"{', '.join(columns_to_drop)}"
+            )
+        if missing_columns:
+            logger.info(
+                "Following null filled columns will be added to match the table's schema:"
+                f" {', '.join(missing_columns)} "
+            )
+
+        df = df.drop(columns=columns_to_drop)
+
+        for column in missing_columns:
+            df[column] = pd.Series()
+
     def upload_contents(self, path: Path) -> None:
         df = pd.read_json(path, orient="records", lines=True)
         df.replace({np.nan: None}, inplace=True)
+        self._fit_to_schema(df=df, columns=self.get_table_columns())
 
         columns = list(df.columns)
         stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) VALUES({','.join([self.values_delimiter for x in columns])})"  # noqa E501
-
-        for rows in pd.read_json(
-            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
-        ):
+        logger.info(
+            f"writing a total of {len(df)} elements via"
+            f" document batches to destination"
+            f" table named {self.upload_config.table_name}"
+            f" with batch size {self.upload_config.batch_size}"
+        )
+        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            with self.connection_config.get_cursor() as cursor:
                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                # For debugging purposes:
                # for val in values:
                #     try:
                #         cursor.execute(stmt, val)
@@ -343,5 +386,33 @@
                #         print(f"failed to write {len(columns)}, {len(val)}: {stmt} -> {val}")
                cursor.executemany(stmt, values)
 
+    def get_table_columns(self) -> list[str]:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute(f"SELECT * from {self.upload_config.table_name}")
+            return [desc[0] for desc in cursor.description]
+
+    def can_delete(self) -> bool:
+        return self.upload_config.record_id_key in self.get_table_columns()
+
+    def delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with data "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from table {self.upload_config.table_name}"
+        )
+        stmt = f"DELETE FROM {self.upload_config.table_name} WHERE {self.upload_config.record_id_key} = {self.values_delimiter}"  # noqa: E501
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute(stmt, [file_data.identifier])
+            rowcount = cursor.rowcount
+            logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if self.can_delete():
+            self.delete_by_record_id(file_data=file_data)
+        else:
+            logger.warning(
+                f"table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
         self.upload_contents(path=path)
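Note: the uploader now deletes rows carrying the incoming record's id before inserting, so re-ingesting the same document no longer duplicates rows, and it batches inserts through the new split_dataframe helper from unstructured_ingest.utils.data_prep. A sketch of what such a chunker amounts to (this reimplementation is an assumption about the helper's behavior, not its actual source):

from typing import Generator

import pandas as pd

def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
    # Yield successive slices of at most chunk_size rows.
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start : start + chunk_size]

df = pd.DataFrame({"id": range(5), "text": list("abcde")})
batch_sizes = [len(batch) for batch in split_dataframe(df=df, chunk_size=2)]
print(batch_sizes)  # [2, 2, 1]
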
unstructured_ingest/v2/processes/connectors/weaviate/__init__.py (new file)

@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+)
+
+from .cloud import CONNECTOR_TYPE as CLOUD_WEAVIATE_CONNECTOR_TYPE
+from .cloud import weaviate_cloud_destination_entry
+from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
+from .embedded import weaviate_embedded_destination_entry
+from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
+from .local import weaviate_local_destination_entry
+
+add_destination_entry(
+    destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
+)
+add_destination_entry(
+    destination_type=CLOUD_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_cloud_destination_entry
+)
+add_destination_entry(
+    destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
+)
unstructured_ingest/v2/processes/connectors/weaviate/cloud.py (new file)

@@ -0,0 +1,164 @@
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.weaviate.weaviate import (
+    WeaviateAccessConfig,
+    WeaviateConnectionConfig,
+    WeaviateUploader,
+    WeaviateUploaderConfig,
+    WeaviateUploadStager,
+    WeaviateUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from weaviate.auth import AuthCredentials
+    from weaviate.client import WeaviateClient
+
+CONNECTOR_TYPE = "weaviate-cloud"
+
+
+class CloudWeaviateAccessConfig(WeaviateAccessConfig):
+    access_token: Optional[str] = Field(
+        default=None, description="Used to create the bearer token."
+    )
+    api_key: Optional[str] = None
+    client_secret: Optional[str] = None
+    password: Optional[str] = None
+
+
+class CloudWeaviateConnectionConfig(WeaviateConnectionConfig):
+    cluster_url: str = Field(
+        description="The WCD cluster URL or hostname to connect to. "
+        "Usually in the form: rAnD0mD1g1t5.something.weaviate.cloud"
+    )
+    username: Optional[str] = None
+    anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
+    refresh_token: Optional[str] = Field(
+        default=None,
+        description="Will tie this value to the bearer token. If not provided, "
+        "the authentication will expire once the lifetime of the access token is up.",
+    )
+    access_config: Secret[CloudWeaviateAccessConfig]
+
+    def model_post_init(self, __context: Any) -> None:
+        if self.anonymous:
+            return
+        access_config = self.access_config.get_secret_value()
+        auths = {
+            "api_key": access_config.api_key is not None,
+            "bearer_token": access_config.access_token is not None,
+            "client_secret": access_config.client_secret is not None,
+            "client_password": access_config.password is not None and self.username is not None,
+        }
+        if len(auths) == 0:
+            raise ValueError("No auth values provided and anonymous is False")
+        if len(auths) > 1:
+            existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
+            raise ValueError(
+                "Multiple auth values provided, only one approach can be used: {}".format(
+                    ", ".join(existing_auths)
+                )
+            )
+
+    @requires_dependencies(["weaviate"], extras="weaviate")
+    def get_api_key_auth(self) -> Optional["AuthCredentials"]:
+        from weaviate.classes.init import Auth
+
+        if api_key := self.access_config.get_secret_value().api_key:
+            return Auth.api_key(api_key=api_key)
+        return None
+
+    @requires_dependencies(["weaviate"], extras="weaviate")
+    def get_bearer_token_auth(self) -> Optional["AuthCredentials"]:
+        from weaviate.classes.init import Auth
+
+        if access_token := self.access_config.get_secret_value().access_token:
+            return Auth.bearer_token(access_token=access_token, refresh_token=self.refresh_token)
+        return None
+
+    @requires_dependencies(["weaviate"], extras="weaviate")
+    def get_client_secret_auth(self) -> Optional["AuthCredentials"]:
+        from weaviate.classes.init import Auth
+
+        if client_secret := self.access_config.get_secret_value().client_secret:
+            return Auth.client_credentials(client_secret=client_secret)
+        return None
+
+    @requires_dependencies(["weaviate"], extras="weaviate")
+    def get_client_password_auth(self) -> Optional["AuthCredentials"]:
+        from weaviate.classes.init import Auth
+
+        if (username := self.username) and (
+            password := self.access_config.get_secret_value().password
+        ):
+            return Auth.client_password(username=username, password=password)
+        return None
+
+    @requires_dependencies(["weaviate"], extras="weaviate")
+    def get_auth(self) -> "AuthCredentials":
+        auths = [
+            self.get_api_key_auth(),
+            self.get_client_secret_auth(),
+            self.get_bearer_token_auth(),
+            self.get_client_password_auth(),
+        ]
+        auths = [auth for auth in auths if auth]
+        if len(auths) == 0:
+            raise ValueError("No auth values provided and anonymous is False")
+        if len(auths) > 1:
+            raise ValueError("Multiple auth values provided, only one approach can be used")
+        return auths[0]
+
+    @contextmanager
+    @requires_dependencies(["weaviate"], extras="weaviate")
+    def get_client(self) -> Generator["WeaviateClient", None, None]:
+        from weaviate import connect_to_weaviate_cloud
+        from weaviate.classes.init import AdditionalConfig
+
+        auth_credentials = None if self.anonymous else self.get_auth()
+        with connect_to_weaviate_cloud(
+            cluster_url=self.cluster_url,
+            auth_credentials=auth_credentials,
+            additional_config=AdditionalConfig(timeout=self.get_timeout()),
+        ) as weaviate_client:
+            yield weaviate_client
+
+
+class CloudWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
+    pass
+
+
+@dataclass
+class CloudWeaviateUploadStager(WeaviateUploadStager):
+    upload_stager_config: CloudWeaviateUploadStagerConfig = field(
+        default_factory=lambda: WeaviateUploadStagerConfig()
+    )
+
+
+class CloudWeaviateUploaderConfig(WeaviateUploaderConfig):
+    pass
+
+
+@dataclass
+class CloudWeaviateUploader(WeaviateUploader):
+    connection_config: CloudWeaviateConnectionConfig = field(
+        default_factory=lambda: CloudWeaviateConnectionConfig()
+    )
+    upload_config: CloudWeaviateUploaderConfig = field(
+        default_factory=lambda: CloudWeaviateUploaderConfig()
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+weaviate_cloud_destination_entry = DestinationRegistryEntry(
+    connection_config=CloudWeaviateConnectionConfig,
+    uploader=CloudWeaviateUploader,
+    uploader_config=CloudWeaviateUploaderConfig,
+    upload_stager=CloudWeaviateUploadStager,
+    upload_stager_config=CloudWeaviateUploadStagerConfig,
+)
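
Note: the new cloud connector accepts exactly one auth mechanism (unless anonymous is set) and hands out the v4 weaviate client through a context manager. A minimal sketch of the underlying client calls it wraps; the cluster URL and API key are placeholders:

from weaviate import connect_to_weaviate_cloud
from weaviate.classes.init import Auth

# Connect the same way CloudWeaviateConnectionConfig.get_client does (placeholders throughout).
with connect_to_weaviate_cloud(
    cluster_url="https://example-cluster.weaviate.cloud",
    auth_credentials=Auth.api_key(api_key="YOUR-API-KEY"),
) as client:
    print(client.is_ready())  # basic readiness check before writing any collections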