PyPI - unstructured-ingest - Versions diffs - 0.0.21__py3-none-any.whl → 0.0.22__py3-none-any.whl - Mend

unstructured-ingest 0.0.21__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (41) hide show

unstructured_ingest/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.0.21" # pragma: no cover
1	+ __version__ = "0.0.22" # pragma: no cover

unstructured_ingest/embed/bedrock.py CHANGED Viewed

@@ -1,38 +1,43 @@
+import json
+import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING
 import numpy as np
-from pydantic import SecretStr
+from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
 from unstructured_ingest.utils.dep_check import requires_dependencies
 if TYPE_CHECKING:
-    from langchain_community.embeddings import BedrockEmbeddings
+    from botocore.client import BaseClient
+    class BedrockClient(BaseClient):
+        def invoke_model(self, body: str, modelId: str, trace: str) -> dict:
+            pass
 class BedrockEmbeddingConfig(EmbeddingConfig):
     aws_access_key_id: SecretStr
     aws_secret_access_key: SecretStr
     region_name: str = "us-west-2"
+    embed_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
     @requires_dependencies(
-        ["boto3", "numpy", "langchain_community"],
+        ["boto3", "numpy", "botocore"],
         extras="bedrock",
     )
-    def get_client(self) -> "BedrockEmbeddings":
+    def get_client(self) -> "BedrockClient":
         # delay import only when needed
         import boto3
-        from langchain_community.embeddings import BedrockEmbeddings
-        bedrock_runtime = boto3.client(
+        bedrock_client = boto3.client(
             service_name="bedrock-runtime",
             aws_access_key_id=self.aws_access_key_id.get_secret_value(),
             aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
             region_name=self.region_name,
         )
-        bedrock_client = BedrockEmbeddings(client=bedrock_runtime)
         return bedrock_client
@@ -40,28 +45,60 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
 class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     config: BedrockEmbeddingConfig
-    def get_exemplary_embedding(self) -> List[float]:
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="Q")
-    def num_of_dimensions(self):
+    def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.shape(exemplary_embedding)
-    def is_unit_vector(self):
+    def is_unit_vector(self) -> bool:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    def embed_query(self, query):
-        bedrock_client = self.config.get_client()
-        return np.array(bedrock_client.embed_query(query))
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
-        bedrock_client = self.config.get_client()
-        embeddings = bedrock_client.embed_documents([e.get("text", "") for e in elements])
+    def embed_query(self, query: str) -> list[float]:
+        """Call out to Bedrock embedding endpoint."""
+        # replace newlines, which can negatively affect performance.
+        text = query.replace(os.linesep, " ")
+        # format input body for provider
+        provider = self.config.embed_model_name.split(".")[0]
+        input_body = {}
+        if provider == "cohere":
+            if "input_type" not in input_body:
+                input_body["input_type"] = "search_document"
+            input_body["texts"] = [text]
+        else:
+            # includes common provider == "amazon"
+            input_body["inputText"] = text
+        body = json.dumps(input_body)
+        try:
+            bedrock_client = self.config.get_client()
+            # invoke bedrock API
+            response = bedrock_client.invoke_model(
+                body=body,
+                modelId=self.config.embed_model_name,
+                accept="application/json",
+                contentType="application/json",
+            )
+            # format output based on provider
+            response_body = json.loads(response.get("body").read())
+            if provider == "cohere":
+                return response_body.get("embeddings")[0]
+            else:
+                # includes common provider == "amazon"
+                return response_body.get("embedding")
+        except Exception as e:
+            raise ValueError(f"Error raised by inference endpoint: {e}")
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/embed/huggingface.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 import numpy as np
 from pydantic import Field
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 if TYPE_CHECKING:
-    from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+    from sentence_transformers import SentenceTransformer
 class HuggingFaceEmbeddingConfig(EmbeddingConfig):
@@ -19,51 +19,51 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
         default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
     )
     encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
-    cache_folder: Optional[dict] = Field(default=None)
+    cache_folder: Optional[str] = Field(default=None)
     @requires_dependencies(
-        ["langchain_huggingface"],
+        ["sentence_transformers"],
         extras="embed-huggingface",
     )
-    def get_client(self) -> "HuggingFaceEmbeddings":
-        """Creates a langchain Huggingface python client to embed elements."""
-        from langchain_huggingface.embeddings import HuggingFaceEmbeddings
-        client = HuggingFaceEmbeddings(
-            model_name=self.embedder_model_name,
-            model_kwargs=self.embedder_model_kwargs,
-            encode_kwargs=self.encode_kwargs,
+    def get_client(self) -> "SentenceTransformer":
+        from sentence_transformers import SentenceTransformer
+        return SentenceTransformer(
+            model_name_or_path=self.embedder_model_name,
             cache_folder=self.cache_folder,
+            **self.embedder_model_kwargs,
         )
-        return client
 @dataclass
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
     config: HuggingFaceEmbeddingConfig
-    def get_exemplary_embedding(self) -> List[float]:
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="Q")
-    def num_of_dimensions(self):
+    def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.shape(exemplary_embedding)
-    def is_unit_vector(self):
+    def is_unit_vector(self) -> bool:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    def embed_query(self, query):
-        client = self.config.get_client()
-        return client.embed_query(str(query))
+    def embed_query(self, query: str) -> list[float]:
+        return self._embed_documents(texts=[query])[0]
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
+    def _embed_documents(self, texts: list[str]) -> list[list[float]]:
         client = self.config.get_client()
-        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        embeddings = client.encode(texts, **self.config.encode_kwargs)
+        return embeddings.tolist()
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> List[dict]:
+    def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []

unstructured_ingest/embed/interfaces.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import List, Tuple
 from pydantic import BaseModel
@@ -19,7 +18,7 @@ class BaseEmbeddingEncoder(ABC):
     @property
     @abstractmethod
-    def num_of_dimensions(self) -> Tuple[int]:
+    def num_of_dimensions(self) -> tuple[int, ...]:
         """Number of dimensions for the embedding vector."""
     @property
@@ -28,9 +27,17 @@ class BaseEmbeddingEncoder(ABC):
         """Denotes if the embedding vector is a unit vector."""
     @abstractmethod
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
         pass
     @abstractmethod
-    def embed_query(self, query: str) -> List[float]:
+    def embed_query(self, query: str) -> list[float]:
         pass
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        results = []
+        for text in elements:
+            response = self.embed_query(query=text)
+            results.append(response)
+        return results

unstructured_ingest/embed/mixedbreadai.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 import numpy as np
 from pydantic import Field, SecretStr
@@ -67,10 +67,10 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: MixedbreadAIEmbeddingConfig
-    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+    _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
     _request_options: Optional["RequestOptions"] = field(init=False, default=None)
-    def get_exemplary_embedding(self) -> List[float]:
+    def get_exemplary_embedding(self) -> list[float]:
         """Get an exemplary embedding to determine dimensions and unit vector status."""
         return self._embed(["Q"])[0]
@@ -91,7 +91,7 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
         )
     @property
-    def num_of_dimensions(self):
+    def num_of_dimensions(self) -> tuple[int, ...]:
         """Get the number of dimensions for the embeddings."""
         exemplary_embedding = self.get_exemplary_embedding()
         return np.shape(exemplary_embedding)
@@ -102,15 +102,15 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    def _embed(self, texts: List[str]) -> List[List[float]]:
+    def _embed(self, texts: list[str]) -> list[list[float]]:
         """
         Embed a list of texts using the Mixedbread AI API.
         Args:
-            texts (List[str]): List of texts to embed.
+            texts (list[str]): List of texts to embed.
         Returns:
-            List[List[float]]: List of embeddings.
+            list[list[float]]: List of embeddings.
         """
         batch_size = BATCH_SIZE
         batch_itr = range(0, len(texts), batch_size)
@@ -132,17 +132,17 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
     @staticmethod
     def _add_embeddings_to_elements(
-        elements: List[dict], embeddings: List[List[float]]
-    ) -> List[dict]:
+        elements: list[dict], embeddings: list[list[float]]
+    ) -> list[dict]:
         """
         Add embeddings to elements.
         Args:
-            elements (List[Element]): List of elements.
-            embeddings (List[List[float]]): List of embeddings.
+            elements (list[Element]): List of elements.
+            embeddings (list[list[float]]): List of embeddings.
         Returns:
-            List[Element]: Elements with embeddings added.
+            list[Element]: Elements with embeddings added.
         """
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
@@ -151,20 +151,20 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
             elements_w_embedding.append(element)
         return elements
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
         """
         Embed a list of document elements.
         Args:
-            elements (List[Element]): List of document elements.
+            elements (list[Element]): List of document elements.
         Returns:
-            List[Element]: Elements with embeddings.
+            list[Element]: Elements with embeddings.
         """
         embeddings = self._embed([e.get("text", "") for e in elements])
         return self._add_embeddings_to_elements(elements, embeddings)
-    def embed_query(self, query: str) -> List[float]:
+    def embed_query(self, query: str) -> list[float]:
         """
         Embed a query string.
@@ -172,6 +172,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
             query (str): Query string to embed.
         Returns:
-            List[float]: Embedding of the query.
+            list[float]: Embedding of the query.
         """
         return self._embed([query])[0]

unstructured_ingest/embed/octoai.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 import numpy as np
 from pydantic import Field, SecretStr
@@ -31,16 +31,16 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
 class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: OctoAiEmbeddingConfig
     # Uses the OpenAI SDK
-    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+    _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
-    def get_exemplary_embedding(self) -> List[float]:
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query("Q")
-    def num_of_dimensions(self):
+    def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.shape(exemplary_embedding)
-    def is_unit_vector(self):
+    def is_unit_vector(self) -> bool:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
@@ -49,12 +49,12 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
         response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
         return response.data[0].embedding
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = [self.embed_query(e.get("text", "")) for e in elements]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/embed/openai.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING
 import numpy as np
 from pydantic import Field, SecretStr
@@ -8,51 +8,46 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 if TYPE_CHECKING:
-    from langchain_openai.embeddings import OpenAIEmbeddings
+    from openai import OpenAI
 class OpenAIEmbeddingConfig(EmbeddingConfig):
     api_key: SecretStr
     embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
-    @requires_dependencies(["langchain_openai"], extras="openai")
-    def get_client(self) -> "OpenAIEmbeddings":
-        """Creates a langchain OpenAI python client to embed elements."""
-        from langchain_openai import OpenAIEmbeddings
+    @requires_dependencies(["openai"], extras="openai")
+    def get_client(self) -> "OpenAI":
+        from openai import OpenAI
-        openai_client = OpenAIEmbeddings(
-            openai_api_key=self.api_key.get_secret_value(),
-            model=self.embedder_model_name,  # type:ignore
-        )
-        return openai_client
+        return OpenAI(api_key=self.api_key.get_secret_value())
 @dataclass
 class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: OpenAIEmbeddingConfig
-    def get_exemplary_embedding(self) -> List[float]:
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="Q")
-    def num_of_dimensions(self):
+    def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.shape(exemplary_embedding)
-    def is_unit_vector(self):
+    def is_unit_vector(self) -> bool:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    def embed_query(self, query):
+    def embed_query(self, query: str) -> list[float]:
         client = self.config.get_client()
-        return client.embed_query(str(query))
+        response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+        return response.data[0].embedding
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
-        client = self.config.get_client()
-        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/embed/vertexai.py CHANGED Viewed

@@ -3,7 +3,7 @@ import json
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, Any, List, Optional
+from typing import TYPE_CHECKING, Annotated, Any, Optional
 import numpy as np
 from pydantic import Field, Secret, ValidationError
@@ -13,7 +13,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 if TYPE_CHECKING:
-    from langchain_google_vertexai import VertexAIEmbeddings
+    from vertexai.language_models import TextEmbeddingModel
 def conform_string_to_dict(value: Any) -> dict:
@@ -41,45 +41,53 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
         os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(application_credentials_path)
     @requires_dependencies(
-        ["langchain", "langchain_google_vertexai"],
+        ["vertexai"],
         extras="embed-vertexai",
     )
-    def get_client(self) -> "VertexAIEmbeddings":
+    def get_client(self) -> "TextEmbeddingModel":
         """Creates a Langchain VertexAI python client to embed elements."""
-        from langchain_google_vertexai import VertexAIEmbeddings
+        from vertexai.language_models import TextEmbeddingModel
         self.register_application_credentials()
-        vertexai_client = VertexAIEmbeddings(model_name=self.embedder_model_name)
-        return vertexai_client
+        return TextEmbeddingModel.from_pretrained(self.embedder_model_name)
 @dataclass
 class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VertexAIEmbeddingConfig
-    def get_exemplary_embedding(self) -> List[float]:
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="A sample query.")
-    def num_of_dimensions(self):
+    def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.shape(exemplary_embedding)
-    def is_unit_vector(self):
+    def is_unit_vector(self) -> bool:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
     def embed_query(self, query):
-        client = self.config.get_client()
-        result = client.embed_query(str(query))
-        return result
+        return self._embed_documents(elements=[query])[0]
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
-        client = self.config.get_client()
-        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+    @requires_dependencies(
+        ["vertexai"],
+        extras="embed-vertexai",
+    )
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        from vertexai.language_models import TextEmbeddingInput
+        client = self.config.get_client()
+        inputs = [TextEmbeddingInput(text=element) for element in elements]
+        embeddings = client.get_embeddings(inputs)
+        return [e.values for e in embeddings]
+    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/embed/voyageai.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 import numpy as np
 from pydantic import Field, SecretStr
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 if TYPE_CHECKING:
-    from langchain_voyageai import VoyageAIEmbeddings
+    from voyageai import Client as VoyageAIClient
 class VoyageAIEmbeddingConfig(EmbeddingConfig):
@@ -16,28 +16,30 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
     embedder_model_name: str = Field(alias="model_name")
     batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
+    max_retries: int = 0
+    timeout_in_seconds: Optional[int] = None
     @requires_dependencies(
         ["langchain", "langchain_voyageai"],
         extras="embed-voyageai",
     )
-    def get_client(self) -> "VoyageAIEmbeddings":
+    def get_client(self) -> "VoyageAIClient":
         """Creates a Langchain VoyageAI python client to embed elements."""
-        from langchain_voyageai import VoyageAIEmbeddings
+        from voyageai import Client as VoyageAIClient
-        return VoyageAIEmbeddings(
-            voyage_api_key=self.api_key,
-            model=self.embedder_model_name,
-            batch_size=self.batch_size,
-            truncation=self.truncation,
+        client = VoyageAIClient(
+            api_key=self.api_key.get_secret_value(),
+            max_retries=self.max_retries,
+            timeout=self.timeout_in_seconds,
         )
+        return client
 @dataclass
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig
-    def get_exemplary_embedding(self) -> List[float]:
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="A sample query.")
     @property
@@ -50,17 +52,20 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
-        client = self.config.get_client()
-        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        client: VoyageAIClient = self.config.get_client()
+        response = client.embed(texts=elements, model=self.config.embedder_model_name)
+        return response.embeddings
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         return self._add_embeddings_to_elements(elements, embeddings)
-    def embed_query(self, query: str) -> List[float]:
-        client = self.config.get_client()
-        return client.embed_query(query)
+    def embed_query(self, query: str) -> list[float]:
+        return self._embed_documents(elements=[query])[0]
     @staticmethod
-    def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
+    def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/v2/cli/base/cmd.py CHANGED Viewed

@@ -155,7 +155,7 @@ class BaseCmd(ABC):
     @staticmethod
     def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
         filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
-        if not filterer_configs.dict():
+        if not filterer_configs.model_dump():
             return None
         return Filterer(config=filterer_configs)

unstructured_ingest/v2/interfaces/connector.py CHANGED Viewed

@@ -19,7 +19,7 @@ class ConnectionConfig(BaseModel):
     def get_access_config(self) -> dict[str, Any]:
         if not self.access_config:
             return {}
-        return self.access_config.get_secret_value().dict()
+        return self.access_config.get_secret_value().model_dump()
 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)

unstructured-ingest 0.0.21__py3-none-any.whl → 0.0.22__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.0.21__py3-none-any.whl → 0.0.22__py3-none-any.whl