PyPI - unstructured-ingest - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl - Mend

unstructured-ingest 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (25) hide show

test/integration/connectors/test_google_drive.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import os
+import uuid
 import pytest
+from googleapiclient.errors import HttpError
 from test.integration.connectors.utils.constants import (
     SOURCE_TAG,
@@ -13,6 +15,9 @@ from test.integration.connectors.utils.validation.source import (
     update_fixtures,
 )
 from test.integration.utils import requires_env
+from unstructured_ingest.error import (
+    SourceConnectionError,
+)
 from unstructured_ingest.v2.interfaces import Downloader, Indexer
 from unstructured_ingest.v2.processes.connectors.google_drive import (
     CONNECTOR_TYPE,
@@ -25,6 +30,49 @@ from unstructured_ingest.v2.processes.connectors.google_drive import (
 )
+@pytest.fixture
+def google_drive_connection_config():
+    """
+    Build a valid GoogleDriveConnectionConfig using the environment variables.
+    Expects:
+      - GOOGLE_DRIVE_ID
+      - GOOGLE_DRIVE_SERVICE_KEY
+    """
+    drive_id = os.getenv("GOOGLE_DRIVE_ID")
+    service_key = os.getenv("GOOGLE_DRIVE_SERVICE_KEY")
+    if not drive_id or not service_key:
+        pytest.skip("Google Drive credentials not provided in environment variables.")
+    access_config = GoogleDriveAccessConfig(service_account_key=service_key)
+    return GoogleDriveConnectionConfig(drive_id=drive_id, access_config=access_config)
+@pytest.fixture
+def google_drive_empty_folder(google_drive_connection_config):
+    """
+    Creates an empty folder on Google Drive for testing the "empty folder" case.
+    The folder is deleted after the test.
+    """
+    from google.oauth2 import service_account
+    from googleapiclient.discovery import build
+    access_config = google_drive_connection_config.access_config.get_secret_value()
+    creds = service_account.Credentials.from_service_account_info(access_config.service_account_key)
+    service = build("drive", "v3", credentials=creds)
+    # Create an empty folder.
+    file_metadata = {
+        "name": f"utic-empty-folder-{uuid.uuid4()}",
+        "mimeType": "application/vnd.google-apps.folder",
+    }
+    folder = service.files().create(body=file_metadata, fields="id, name").execute()
+    folder_id = folder.get("id")
+    try:
+        yield folder_id
+    finally:
+        service.files().delete(fileId=folder_id).execute()
 @requires_env("GOOGLE_DRIVE_SERVICE_KEY")
 @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE)
 def test_google_drive_source(temp_dir):
@@ -114,3 +162,96 @@ def source_connector_validation(
             save_downloads=configs.validate_downloaded_files,
             save_filedata=configs.validate_file_data,
         )
+# Precheck fails when the drive ID has an appended parameter (simulate copy-paste error)
+@pytest.mark.tags("google-drive", "precheck")
+@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+def test_google_drive_precheck_invalid_parameter(google_drive_connection_config):
+    # Append a query parameter as often happens when copying from a URL.
+    invalid_drive_id = google_drive_connection_config.drive_id + "?usp=sharing"
+    connection_config = GoogleDriveConnectionConfig(
+        drive_id=invalid_drive_id,
+        access_config=google_drive_connection_config.access_config,
+    )
+    index_config = GoogleDriveIndexerConfig(recursive=True)
+    indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
+    with pytest.raises(SourceConnectionError) as excinfo:
+        indexer.precheck()
+    assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()
+# Precheck fails due to lack of permission (simulate via monkeypatching).
+@pytest.mark.tags("google-drive", "precheck")
+@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+def test_google_drive_precheck_no_permission(google_drive_connection_config, monkeypatch):
+    index_config = GoogleDriveIndexerConfig(recursive=True)
+    indexer = GoogleDriveIndexer(
+        connection_config=google_drive_connection_config,
+        index_config=index_config,
+    )
+    # Monkeypatch get_root_info to always raise an HTTP 403 error.
+    def fake_get_root_info(files_client, object_id):
+        raise HttpError(
+            resp=type("Response", (), {"status": 403, "reason": "Forbidden"})(),
+            content=b"Forbidden",
+        )
+    monkeypatch.setattr(indexer, "get_root_info", fake_get_root_info)
+    with pytest.raises(SourceConnectionError) as excinfo:
+        indexer.precheck()
+    assert "forbidden" in str(excinfo.value).lower() or "permission" in str(excinfo.value).lower()
+# Precheck fails when the folder is empty.
+# @pytest.mark.tags("google-drive", "precheck")
+# @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+# def test_google_drive_precheck_empty_folder(
+#     google_drive_connection_config, google_drive_empty_folder
+# ):
+#     # Use the empty folder's ID as the target.
+#     connection_config = GoogleDriveConnectionConfig(
+#         drive_id=google_drive_empty_folder,
+#         access_config=google_drive_connection_config.access_config,
+#     )
+#     index_config = GoogleDriveIndexerConfig(recursive=True)
+#     indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
+#     with pytest.raises(SourceConnectionError) as excinfo:
+#         indexer.precheck()
+#     assert "empty folder" in str(excinfo.value).lower()
+@pytest.mark.tags("google-drive", "count", "integration")
+@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+def test_google_drive_count_files(google_drive_connection_config):
+    """
+    Verify that count_files_recursively returns the expected number of files for the
+    drive configured through the environment, with a pdf/docx extension filter applied.
+    According to the test credentials, there are 3 files in the root directory and 1 nested
+    file, so the total count should be 4.
+    """
+    # NOTE(review): the extension filter is assumed to be the same one the other tests in
+    # this file apply. The test directory reportedly holds 6 files in total, so the
+    # expected count of 4 presumably only holds with this filter in place -- confirm.
+    extensions_filter = ["pdf", "docx"]
+    with google_drive_connection_config.get_client() as client:
+        count = GoogleDriveIndexer.count_files_recursively(
+            client, google_drive_connection_config.drive_id, extensions_filter
+        )
+    assert count == 4, f"Expected file count of 4, but got {count}"
+# Precheck fails with a completely invalid drive ID.
+@pytest.mark.tags("google-drive", "precheck")
+@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+def test_google_drive_precheck_invalid_drive_id(google_drive_connection_config):
+    invalid_drive_id = "invalid_drive_id"
+    connection_config = GoogleDriveConnectionConfig(
+        drive_id=invalid_drive_id,
+        access_config=google_drive_connection_config.access_config,
+    )
+    index_config = GoogleDriveIndexerConfig(recursive=True)
+    indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
+    with pytest.raises(SourceConnectionError) as excinfo:
+        indexer.precheck()
+    assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()

test/unit/v2/embedders/test_bedrock.py CHANGED Viewed

@@ -15,7 +15,7 @@ def generate_embedder_config_params() -> dict:
         "region_name": fake.city(),
     }
     if random.random() < 0.5:
-        params["embed_model_name"] = fake.word()
+        params["embedder_model_name"] = fake.word()
     return params

test/unit/v2/embedders/test_huggingface.py CHANGED Viewed

@@ -16,7 +16,7 @@ fake = faker.Faker()
 def generate_embedder_config_params() -> dict:
     params = {}
     if random.random() < 0.5:
-        params["embed_model_name"] = fake.word() if random.random() < 0.5 else None
+        params["embedder_model_name"] = fake.word() if random.random() < 0.5 else None
         params["embedder_model_kwargs"] = (
             generate_random_dictionary(key_type=str, value_type=Any)
             if random.random() < 0.5

unstructured_ingest/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.5.1" # pragma: no cover
1	+ __version__ = "0.5.3" # pragma: no cover

unstructured_ingest/embed/azure_openai.py CHANGED Viewed

@@ -44,7 +44,13 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
 class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
     config: AzureOpenAIEmbeddingConfig
+    def get_client(self) -> "AzureOpenAI":
+        return self.config.get_client()
 @dataclass
 class AsyncAzureOpenAIEmbeddingEncoder(AsyncOpenAIEmbeddingEncoder):
     config: AzureOpenAIEmbeddingConfig
+    def get_client(self) -> "AsyncAzureOpenAI":
+        return self.config.get_async_client()

unstructured_ingest/embed/bedrock.py CHANGED Viewed

@@ -8,13 +8,20 @@ from typing import TYPE_CHECKING, AsyncIterable
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
+from unstructured_ingest.v2.errors import (
+    ProviderError,
+    RateLimitError,
+    UserAuthError,
+    UserError,
+    is_internal_error,
+)
 if TYPE_CHECKING:
     from botocore.client import BaseClient
@@ -50,9 +57,11 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
     aws_access_key_id: SecretStr
     aws_secret_access_key: SecretStr
     region_name: str = "us-west-2"
-    embed_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
+    embedder_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
     def wrap_error(self, e: Exception) -> Exception:
+        if is_internal_error(e=e):
+            return e
         from botocore.exceptions import ClientError
         if isinstance(e, ClientError):
@@ -121,7 +130,7 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
-        provider = self.config.embed_model_name.split(".")[0]
+        provider = self.config.embedder_model_name.split(".")[0]
         body = conform_query(query=query, provider=provider)
         bedrock_client = self.config.get_client()
@@ -129,7 +138,7 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
         try:
             response = bedrock_client.invoke_model(
                 body=json.dumps(body),
-                modelId=self.config.embed_model_name,
+                modelId=self.config.embedder_model_name,
                 accept="application/json",
                 contentType="application/json",
             )
@@ -145,9 +154,14 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
             return response_body.get("embedding")
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
-        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
-        return elements_with_embeddings
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        if not elements_with_text:
+            return elements
+        embeddings = [self.embed_query(query=e["text"]) for e in elements_with_text]
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 @dataclass
@@ -159,7 +173,7 @@ class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
-        provider = self.config.embed_model_name.split(".")[0]
+        provider = self.config.embedder_model_name.split(".")[0]
         body = conform_query(query=query, provider=provider)
         try:
             async with self.config.get_async_client() as bedrock_client:
@@ -167,7 +181,7 @@ class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
                 try:
                     response = await bedrock_client.invoke_model(
                         body=json.dumps(body),
-                        modelId=self.config.embed_model_name,
+                        modelId=self.config.embedder_model_name,
                         accept="application/json",
                         contentType="application/json",
                     )
@@ -186,8 +200,11 @@ class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
             raise ValueError(f"Error raised by inference endpoint: {e}")
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
         embeddings = await asyncio.gather(
-            *[self.embed_query(query=e.get("text", "")) for e in elements]
+            *[self.embed_query(query=e.get("text", "")) for e in elements_with_text]
         )
-        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
-        return elements_with_embeddings
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements

unstructured_ingest/embed/huggingface.py CHANGED Viewed

@@ -3,7 +3,11 @@ from typing import TYPE_CHECKING, Optional
 from pydantic import Field
-from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
+    BaseEmbeddingEncoder,
+    EmbeddingConfig,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 if TYPE_CHECKING:
@@ -43,7 +47,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
     config: HuggingFaceEmbeddingConfig
-    def embed_query(self, query: str) -> list[float]:
+    def _embed_query(self, query: str) -> list[float]:
         return self._embed_documents(texts=[query])[0]
     def _embed_documents(self, texts: list[str]) -> list[list[float]]:
@@ -52,6 +56,11 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
         return embeddings.tolist()
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = self._embed_documents([e.get("text", "") for e in elements])
-        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
-        return elements_with_embeddings
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        if not elements_with_text:
+            return elements
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements

unstructured_ingest/embed/interfaces.py CHANGED Viewed

@@ -1,11 +1,14 @@
-import asyncio
-from abc import ABC, abstractmethod
+from abc import ABC
 from dataclasses import dataclass
-from typing import Optional
+from typing import Any, Optional
 import numpy as np
 from pydantic import BaseModel, Field
+from unstructured_ingest.utils.data_prep import batch_generator
+EMBEDDINGS_KEY = "embeddings"
 class EmbeddingConfig(BaseModel):
     batch_size: Optional[int] = Field(
@@ -26,27 +29,6 @@ class BaseEncoder(ABC):
         if possible"""
         return e
-    @staticmethod
-    def _add_embeddings_to_elements(
-        elements: list[dict], embeddings: list[list[float]]
-    ) -> list[dict]:
-        """
-        Add embeddings to elements.
-        Args:
-            elements (list[Element]): List of elements.
-            embeddings (list[list[float]]): List of embeddings.
-        Returns:
-            list[Element]: Elements with embeddings added.
-        """
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
 @dataclass
 class BaseEmbeddingEncoder(BaseEncoder, ABC):
@@ -69,21 +51,37 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    @abstractmethod
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        pass
+    def get_client(self):
+        """Return the provider client that is handed to embed_batch.
+        The base implementation raises NotImplementedError; encoders that rely on the
+        default embed_documents / _embed_query must override it.
+        """
+        raise NotImplementedError
-    @abstractmethod
-    def embed_query(self, query: str) -> list[float]:
-        pass
+    def embed_batch(self, client: Any, batch: list[str]) -> list[list[float]]:
+        """Embed one batch of texts with the given client, one embedding per text.
+        The base implementation raises NotImplementedError; encoders that rely on the
+        default embed_documents / _embed_query must override it.
+        """
+        raise NotImplementedError
-    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        results = []
-        for text in elements:
-            response = self.embed_query(query=text)
-            results.append(response)
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        """Attach an embedding to every element that has non-empty text.
+        The texts are embedded in batches of config.batch_size (or as one batch when no
+        batch size is configured) and each result is stored under EMBEDDINGS_KEY.
+        Elements without text are returned untouched. Only the list is copied, so the
+        element dicts themselves are updated in place.
+        Args:
+            elements: Element dicts; only those with a truthy "text" value are embedded.
+        Returns:
+            A new list holding the same element dicts in their original order.
+        Raises:
+            Exception: whatever wrap_error returns for an error raised while embedding.
+        """
+        client = self.get_client()
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        if not elements_with_text:
+            # Nothing to embed; this also avoids requesting batches of size 0 when no
+            # batch size is configured.
+            return elements
+        texts = [e["text"] for e in elements_with_text]
+        embeddings = []
+        try:
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                # Keep the per-batch result under its own name: rebinding `embeddings`
+                # here dropped all earlier batches and then doubled the latest one.
+                batch_embeddings = self.embed_batch(client=client, batch=batch)
+                embeddings.extend(batch_embeddings)
+        except Exception as e:
+            raise self.wrap_error(e=e)
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
-        return results
+    def _embed_query(self, query: str) -> list[float]:
+        """Embed a single query by sending it as a one-item batch.
+        Exceptions are not wrapped here; embed_query does that.
+        """
+        client = self.get_client()
+        return self.embed_batch(client=client, batch=[query])[0]
+    def embed_query(self, query: str) -> list[float]:
+        """Embed a single query string.
+        Delegates to _embed_query and raises whatever wrap_error returns for any
+        exception that occurs.
+        """
+        try:
+            return self._embed_query(query=query)
+        except Exception as e:
+            raise self.wrap_error(e=e)
 @dataclass
@@ -107,14 +105,35 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
         exemplary_embedding = await self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    @abstractmethod
+    def get_client(self):
+        """Return the provider client that is handed to embed_batch.
+        The base implementation raises NotImplementedError; encoders that rely on the
+        default embed_documents / _embed_query must override it.
+        """
+        raise NotImplementedError
+    async def embed_batch(self, client: Any, batch: list[str]) -> list[list[float]]:
+        """Embed one batch of texts with the given client, one embedding per text.
+        The base implementation raises NotImplementedError; encoders that rely on the
+        default embed_documents / _embed_query must override it.
+        """
+        raise NotImplementedError
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        pass
+        """Attach an embedding to every element that has non-empty text.
+        The texts are embedded batch by batch (config.batch_size per batch, or one batch
+        when no batch size is configured), awaiting each batch in turn, and each result
+        is stored under EMBEDDINGS_KEY. Elements without text are returned untouched.
+        Only the list is copied, so the element dicts themselves are updated in place.
+        Args:
+            elements: Element dicts; only those with a truthy "text" value are embedded.
+        Returns:
+            A new list holding the same element dicts in their original order.
+        Raises:
+            Exception: whatever wrap_error returns for an error raised while embedding.
+        """
+        client = self.get_client()
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        if not elements_with_text:
+            # Nothing to embed; this also avoids requesting batches of size 0 when no
+            # batch size is configured.
+            return elements
+        texts = [e["text"] for e in elements_with_text]
+        embeddings = []
+        try:
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                # Keep the per-batch result under its own name: rebinding `embeddings`
+                # here dropped all earlier batches and then doubled the latest one.
+                batch_embeddings = await self.embed_batch(client=client, batch=batch)
+                embeddings.extend(batch_embeddings)
+        except Exception as e:
+            raise self.wrap_error(e=e)
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
-    @abstractmethod
-    async def embed_query(self, query: str) -> list[float]:
-        pass
+    async def _embed_query(self, query: str) -> list[float]:
+        """Embed a single query by sending it as a one-item batch.
+        Exceptions are not wrapped here; embed_query does that.
+        """
+        client = self.get_client()
+        embeddings = await self.embed_batch(client=client, batch=[query])
+        return embeddings[0]
-    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        results = await asyncio.gather(*[self.embed_query(query=text) for text in elements])
-        return results
+    async def embed_query(self, query: str) -> list[float]:
+        """Embed a single query string.
+        Delegates to _embed_query and raises whatever wrap_error returns for any
+        exception that occurs.
+        """
+        try:
+            return await self._embed_query(query=query)
+        except Exception as e:
+            raise self.wrap_error(e=e)

unstructured_ingest/embed/mixedbreadai.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import asyncio
 import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
@@ -10,7 +9,6 @@ from unstructured_ingest.embed.interfaces import (
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
-from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 USER_AGENT = "@mixedbread-ai/unstructured"
@@ -84,7 +82,7 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
     def get_exemplary_embedding(self) -> list[float]:
         """Get an exemplary embedding to determine dimensions and unit vector status."""
-        return self._embed(["Q"])[0]
+        return self.embed_query(query="Q")
     @requires_dependencies(
         ["mixedbread_ai"],
@@ -99,55 +97,19 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
             additional_headers={"User-Agent": USER_AGENT},
         )
-    def _embed(self, texts: list[str]) -> list[list[float]]:
-        """
-        Embed a list of texts using the Mixedbread AI API.
-        Args:
-            texts (list[str]): List of texts to embed.
-        Returns:
-            list[list[float]]: List of embeddings.
-        """
-        responses = []
-        client = self.config.get_client()
-        for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
-            response = client.embeddings(
-                model=self.config.embedder_model_name,
-                normalized=True,
-                encoding_format=ENCODING_FORMAT,
-                truncation_strategy=TRUNCATION_STRATEGY,
-                request_options=self.get_request_options(),
-                input=batch,
-            )
-            responses.append(response)
-        return [item.embedding for response in responses for item in response.data]
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        """
-        Embed a list of document elements.
-        Args:
-            elements (list[Element]): List of document elements.
-        Returns:
-            list[Element]: Elements with embeddings.
-        """
-        embeddings = self._embed([e.get("text", "") for e in elements])
-        return self._add_embeddings_to_elements(elements, embeddings)
-    def embed_query(self, query: str) -> list[float]:
-        """
-        Embed a query string.
-        Args:
-            query (str): Query string to embed.
-        Returns:
-            list[float]: Embedding of the query.
-        """
-        return self._embed([query])[0]
+    def get_client(self) -> "MixedbreadAI":
+        return self.config.get_client()
+    def embed_batch(self, client: "MixedbreadAI", batch: list[str]) -> list[list[float]]:
+        """Embed one batch of texts with a single embeddings request.
+        Returns the embedding of every item in the response's data, in response order.
+        """
+        response = client.embeddings(
+            model=self.config.embedder_model_name,
+            normalized=True,
+            encoding_format=ENCODING_FORMAT,
+            truncation_strategy=TRUNCATION_STRATEGY,
+            request_options=self.get_request_options(),
+            input=batch,
+        )
+        return [datum.embedding for datum in response.data]
 @dataclass
@@ -157,8 +119,7 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def get_exemplary_embedding(self) -> list[float]:
         """Get an exemplary embedding to determine dimensions and unit vector status."""
-        embedding = await self._embed(["Q"])
-        return embedding[0]
+        return await self.embed_query(query="Q")
     @requires_dependencies(
         ["mixedbread_ai"],
@@ -173,54 +134,16 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
             additional_headers={"User-Agent": USER_AGENT},
         )
-    async def _embed(self, texts: list[str]) -> list[list[float]]:
-        """
-        Embed a list of texts using the Mixedbread AI API.
-        Args:
-            texts (list[str]): List of texts to embed.
-        Returns:
-            list[list[float]]: List of embeddings.
-        """
-        client = self.config.get_async_client()
-        tasks = []
-        for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
-            tasks.append(
-                client.embeddings(
-                    model=self.config.embedder_model_name,
-                    normalized=True,
-                    encoding_format=ENCODING_FORMAT,
-                    truncation_strategy=TRUNCATION_STRATEGY,
-                    request_options=self.get_request_options(),
-                    input=batch,
-                )
-            )
-        responses = await asyncio.gather(*tasks)
-        return [item.embedding for response in responses for item in response.data]
-    async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        """
-        Embed a list of document elements.
-        Args:
-            elements (list[Element]): List of document elements.
-        Returns:
-            list[Element]: Elements with embeddings.
-        """
-        embeddings = await self._embed([e.get("text", "") for e in elements])
-        return self._add_embeddings_to_elements(elements, embeddings)
-    async def embed_query(self, query: str) -> list[float]:
-        """
-        Embed a query string.
-        Args:
-            query (str): Query string to embed.
-        Returns:
-            list[float]: Embedding of the query.
-        """
-        embedding = await self._embed([query])
-        return embedding[0]
+    def get_client(self) -> "AsyncMixedbreadAI":
+        return self.config.get_async_client()
+    async def embed_batch(self, client: "AsyncMixedbreadAI", batch: list[str]) -> list[list[float]]:
+        """Embed one batch of texts with a single awaited embeddings request.
+        Returns the embedding of every item in the response's data, in response order.
+        """
+        response = await client.embeddings(
+            model=self.config.embedder_model_name,
+            normalized=True,
+            encoding_format=ENCODING_FORMAT,
+            truncation_strategy=TRUNCATION_STRATEGY,
+            request_options=self.get_request_options(),
+            input=batch,
+        )
+        return [datum.embedding for datum in response.data]

unstructured-ingest 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl