PyPI - unstructured-ingest - Versions diffs - 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (24) hide show

unstructured_ingest/embed/octoai.py CHANGED Viewed

@@ -4,13 +4,11 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import (
-    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
-from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -18,6 +16,7 @@ from unstructured_ingest.v2.errors import (
     RateLimitError,
     UserAuthError,
     UserError,
+    is_internal_error,
 )
 if TYPE_CHECKING:
@@ -30,6 +29,8 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
     base_url: str = Field(default="https://text.octoai.run/v1")
     def wrap_error(self, e: Exception) -> Exception:
+        if is_internal_error(e=e):
+            return e
         # https://platform.openai.com/docs/guides/error-codes/api-errors
         from openai import APIStatusError
@@ -81,31 +82,17 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    def embed_query(self, query: str):
-        try:
-            client = self.config.get_client()
-            response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
-        except Exception as e:
-            raise self.wrap_error(e=e)
+    def _embed_query(self, query: str):
+        client = self.get_client()
+        response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
         return response.data[0].embedding
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        texts = [e["text"] for e in elements_with_text]
-        embeddings = []
-        client = self.config.get_client()
-        try:
-            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
-                response = client.embeddings.create(
-                    input=batch, model=self.config.embedder_model_name
-                )
-                embeddings.extend([data.embedding for data in response.data])
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
+    def get_client(self) -> "OpenAI":
+        return self.config.get_client()
+    def embed_batch(self, client: "OpenAI", batch: list[str]) -> list[list[float]]:
+        response = client.embeddings.create(input=batch, model=self.config.embedder_model_name)
+        return [data.embedding for data in response.data]
 @dataclass
@@ -115,30 +102,11 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    async def embed_query(self, query: str):
-        client = self.config.get_async_client()
-        try:
-            response = await client.embeddings.create(
-                input=query, model=self.config.embedder_model_name
-            )
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return response.data[0].embedding
+    def get_client(self) -> "AsyncOpenAI":
+        return self.config.get_async_client()
-    async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        texts = [e["text"] for e in elements_with_text]
-        client = self.config.get_async_client()
-        embeddings = []
-        try:
-            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
-                response = await client.embeddings.create(
-                    input=batch, model=self.config.embedder_model_name
-                )
-                embeddings.extend([data.embedding for data in response.data])
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
+    async def embed_batch(self, client: "AsyncOpenAI", batch: list[str]) -> list[list[float]]:
+        response = await client.embeddings.create(
+            input=batch, model=self.config.embedder_model_name
+        )
+        return [data.embedding for data in response.data]

unstructured_ingest/embed/openai.py CHANGED Viewed

@@ -4,13 +4,11 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import (
-    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
-from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -18,6 +16,7 @@ from unstructured_ingest.v2.errors import (
     RateLimitError,
     UserAuthError,
     UserError,
+    is_internal_error,
 )
 if TYPE_CHECKING:
@@ -29,6 +28,8 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
     embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
     def wrap_error(self, e: Exception) -> Exception:
+        if is_internal_error(e=e):
+            return e
         # https://platform.openai.com/docs/guides/error-codes/api-errors
         from openai import APIStatusError
@@ -72,32 +73,12 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    def embed_query(self, query: str) -> list[float]:
-        client = self.config.get_client()
-        try:
-            response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return response.data[0].embedding
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        client = self.config.get_client()
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        texts = [e["text"] for e in elements_with_text]
-        embeddings = []
-        try:
-            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
-                response = client.embeddings.create(
-                    input=batch, model=self.config.embedder_model_name
-                )
-                embeddings.extend([data.embedding for data in response.data])
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
+    def get_client(self) -> "OpenAI":
+        return self.config.get_client()
+    def embed_batch(self, client: "OpenAI", batch: list[str]) -> list[list[float]]:
+        response = client.embeddings.create(input=batch, model=self.config.embedder_model_name)
+        return [data.embedding for data in response.data]
 @dataclass
@@ -107,30 +88,11 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    async def embed_query(self, query: str) -> list[float]:
-        client = self.config.get_async_client()
-        try:
-            response = await client.embeddings.create(
-                input=query, model=self.config.embedder_model_name
-            )
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return response.data[0].embedding
-    async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        client = self.config.get_async_client()
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        texts = [e["text"] for e in elements_with_text]
-        embeddings = []
-        try:
-            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
-                response = await client.embeddings.create(
-                    input=batch, model=self.config.embedder_model_name
-                )
-                embeddings.extend([data.embedding for data in response.data])
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
+    def get_client(self) -> "AsyncOpenAI":
+        return self.config.get_async_client()
+    async def embed_batch(self, client: "AsyncOpenAI", batch: list[str]) -> list[list[float]]:
+        response = await client.embeddings.create(
+            input=batch, model=self.config.embedder_model_name
+        )
+        return [data.embedding for data in response.data]

unstructured_ingest/embed/togetherai.py CHANGED Viewed

@@ -1,24 +1,19 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import (
-    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
-from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     RateLimitError as CustomRateLimitError,
 )
-from unstructured_ingest.v2.errors import (
-    UserAuthError,
-    UserError,
-)
+from unstructured_ingest.v2.errors import UserAuthError, UserError, is_internal_error
 if TYPE_CHECKING:
     from together import AsyncTogether, Together
@@ -31,6 +26,8 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
     )
     def wrap_error(self, e: Exception) -> Exception:
+        if is_internal_error(e=e):
+            return e
         # https://docs.together.ai/docs/error-codes
         from together.error import AuthenticationError, RateLimitError, TogetherException
@@ -64,31 +61,12 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    def embed_query(self, query: str) -> list[float]:
-        return self._embed_documents(elements=[query])[0]
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
-    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        client = self.config.get_client()
-        embeddings = []
-        try:
-            for batch in batch_generator(
-                elements, batch_size=self.config.batch_size or len(elements)
-            ):
-                outputs = client.embeddings.create(
-                    model=self.config.embedder_model_name, input=batch
-                )
-                embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return embeddings
+    def get_client(self) -> "Together":
+        return self.config.get_client()
+    def embed_batch(self, client: "Together", batch: list[str]) -> list[list[float]]:
+        outputs = client.embeddings.create(model=self.config.embedder_model_name, input=batch)
+        return [outputs.data[i].embedding for i in range(len(batch))]
 @dataclass
@@ -98,29 +76,9 @@ class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    async def embed_query(self, query: str) -> list[float]:
-        embedding = await self._embed_documents(elements=[query])
-        return embedding[0]
-    async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
-    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        client = self.config.get_async_client()
-        embeddings = []
-        try:
-            for batch in batch_generator(
-                elements, batch_size=self.config.batch_size or len(elements)
-            ):
-                outputs = await client.embeddings.create(
-                    model=self.config.embedder_model_name, input=batch
-                )
-                embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return embeddings
+    def get_client(self) -> "AsyncTogether":
+        return self.config.get_async_client()
+    async def embed_batch(self, client: Any, batch: list[str]) -> list[list[float]]:
+        outputs = await client.embeddings.create(model=self.config.embedder_model_name, input=batch)
+        return [outputs.data[i].embedding for i in range(len(batch))]

unstructured_ingest/embed/vertexai.py CHANGED Viewed

@@ -9,14 +9,12 @@ from pydantic import Field, Secret, ValidationError
 from pydantic.functional_validators import BeforeValidator
 from unstructured_ingest.embed.interfaces import (
-    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
-from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.errors import UserAuthError
+from unstructured_ingest.v2.errors import UserAuthError, is_internal_error
 if TYPE_CHECKING:
     from vertexai.language_models import TextEmbeddingModel
@@ -40,6 +38,8 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
     )
     def wrap_error(self, e: Exception) -> Exception:
+        if is_internal_error(e=e):
+            return e
         from google.auth.exceptions import GoogleAuthError
         if isinstance(e, GoogleAuthError):
@@ -72,34 +72,19 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    def embed_query(self, query):
-        return self._embed_documents(elements=[query])[0]
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
+    def get_client(self) -> "TextEmbeddingModel":
+        return self.config.get_client()
     @requires_dependencies(
         ["vertexai"],
         extras="embed-vertexai",
     )
-    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+    def embed_batch(self, client: "TextEmbeddingModel", batch: list[str]) -> list[list[float]]:
         from vertexai.language_models import TextEmbeddingInput
-        inputs = [TextEmbeddingInput(text=element) for element in elements]
-        client = self.config.get_client()
-        embeddings = []
-        try:
-            for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
-                response = client.get_embeddings(batch)
-                embeddings.extend([e.values for e in response])
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return embeddings
+        inputs = [TextEmbeddingInput(text=text) for text in batch]
+        response = client.get_embeddings(inputs)
+        return [e.values for e in response]
 @dataclass
@@ -109,32 +94,16 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    async def embed_query(self, query):
-        embedding = await self._embed_documents(elements=[query])
-        return embedding[0]
-    async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
+    def get_client(self) -> "TextEmbeddingModel":
+        return self.config.get_client()
     @requires_dependencies(
         ["vertexai"],
         extras="embed-vertexai",
     )
-    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+    async def embed_batch(self, client: Any, batch: list[str]) -> list[list[float]]:
         from vertexai.language_models import TextEmbeddingInput
-        inputs = [TextEmbeddingInput(text=element) for element in elements]
-        client = self.config.get_client()
-        embeddings = []
-        try:
-            for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
-                response = await client.get_embeddings_async(batch)
-                embeddings.extend([e.values for e in response])
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return embeddings
+        inputs = [TextEmbeddingInput(text=text) for text in batch]
+        response = await client.get_embeddings_async(inputs)
+        return [e.values for e in response]

unstructured_ingest/embed/voyageai.py CHANGED Viewed

@@ -4,19 +4,13 @@ from typing import TYPE_CHECKING, Optional
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import (
-    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
-from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.errors import (
-    ProviderError,
-    UserAuthError,
-    UserError,
-)
+from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError, is_internal_error
 from unstructured_ingest.v2.errors import (
     RateLimitError as CustomRateLimitError,
 )
@@ -39,6 +33,8 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
     timeout_in_seconds: Optional[int] = None
     def wrap_error(self, e: Exception) -> Exception:
+        if is_internal_error(e=e):
+            return e
         # https://docs.voyageai.com/docs/error-codes
         from voyageai.error import AuthenticationError, RateLimitError, VoyageError
@@ -96,27 +92,12 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        client = self.config.get_client()
-        embeddings = []
-        try:
-            for batch in batch_generator(elements, batch_size=self.config.batch_size):
-                response = client.embed(texts=batch, model=self.config.embedder_model_name)
-                embeddings.extend(response.embeddings)
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return embeddings
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
-    def embed_query(self, query: str) -> list[float]:
-        return self._embed_documents(elements=[query])[0]
+    def get_client(self) -> "VoyageAIClient":
+        return self.config.get_client()
+    def embed_batch(self, client: "VoyageAIClient", batch: list[str]) -> list[list[float]]:
+        response = client.embed(texts=batch, model=self.config.embedder_model_name)
+        return response.embeddings
 @dataclass
@@ -126,27 +107,11 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     def wrap_error(self, e: Exception) -> Exception:
         return self.config.wrap_error(e=e)
-    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        client = self.config.get_async_client()
-        embeddings = []
-        try:
-            for batch in batch_generator(
-                elements, batch_size=self.config.batch_size or len(elements)
-            ):
-                response = await client.embed(texts=batch, model=self.config.embedder_model_name)
-                embeddings.extend(response.embeddings)
-        except Exception as e:
-            raise self.wrap_error(e=e)
-        return embeddings
-    async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
-    async def embed_query(self, query: str) -> list[float]:
-        embedding = await self._embed_documents(elements=[query])
-        return embedding[0]
+    def get_client(self) -> "AsyncVoyageAIClient":
+        return self.config.get_async_client()
+    async def embed_batch(
+        self, client: "AsyncVoyageAIClient", batch: list[str]
+    ) -> list[list[float]]:
+        response = await client.embed(texts=batch, model=self.config.embedder_model_name)
+        return response.embeddings

unstructured_ingest/v2/errors.py CHANGED Viewed

@@ -16,3 +16,10 @@ class QuotaError(UserError):
 class ProviderError(Exception):
     pass
+recognized_errors = [UserError, UserAuthError, RateLimitError, QuotaError, ProviderError]
+def is_internal_error(e: Exception) -> bool:
+    return any(isinstance(e, recognized_error) for recognized_error in recognized_errors)

unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl