unstructured-ingest 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (28)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/embed/__init__.py +17 -0
  3. unstructured_ingest/embed/bedrock.py +70 -0
  4. unstructured_ingest/embed/huggingface.py +73 -0
  5. unstructured_ingest/embed/interfaces.py +36 -0
  6. unstructured_ingest/embed/mixedbreadai.py +177 -0
  7. unstructured_ingest/embed/octoai.py +63 -0
  8. unstructured_ingest/embed/openai.py +61 -0
  9. unstructured_ingest/embed/vertexai.py +88 -0
  10. unstructured_ingest/embed/voyageai.py +69 -0
  11. unstructured_ingest/interfaces.py +17 -7
  12. unstructured_ingest/pipeline/reformat/embedding.py +3 -5
  13. unstructured_ingest/utils/data_prep.py +21 -8
  14. unstructured_ingest/v2/cli/base/src.py +2 -1
  15. unstructured_ingest/v2/pipeline/interfaces.py +3 -1
  16. unstructured_ingest/v2/pipeline/pipeline.py +25 -23
  17. unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
  18. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -2
  19. unstructured_ingest/v2/processes/connectors/onedrive.py +6 -4
  20. unstructured_ingest/v2/processes/connectors/pinecone.py +73 -32
  21. unstructured_ingest/v2/processes/connectors/sharepoint.py +1 -1
  22. unstructured_ingest/v2/processes/embedder.py +41 -24
  23. {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/METADATA +213 -210
  24. {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/RECORD +28 -19
  25. {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/LICENSE.md +0 -0
  26. {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/WHEEL +0 -0
  27. {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/entry_points.txt +0 -0
  28. {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.0.12" # pragma: no cover
+__version__ = "0.0.14" # pragma: no cover
unstructured_ingest/embed/__init__.py
@@ -0,0 +1,17 @@
+from unstructured_ingest.embed.bedrock import BedrockEmbeddingEncoder
+from unstructured_ingest.embed.huggingface import HuggingFaceEmbeddingEncoder
+from unstructured_ingest.embed.mixedbreadai import MixedbreadAIEmbeddingEncoder
+from unstructured_ingest.embed.octoai import OctoAIEmbeddingEncoder
+from unstructured_ingest.embed.openai import OpenAIEmbeddingEncoder
+from unstructured_ingest.embed.vertexai import VertexAIEmbeddingEncoder
+from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingEncoder
+
+EMBEDDING_PROVIDER_TO_CLASS_MAP = {
+    "langchain-openai": OpenAIEmbeddingEncoder,
+    "langchain-huggingface": HuggingFaceEmbeddingEncoder,
+    "langchain-aws-bedrock": BedrockEmbeddingEncoder,
+    "langchain-vertexai": VertexAIEmbeddingEncoder,
+    "langchain-voyageai": VoyageAIEmbeddingEncoder,
+    "mixedbread-ai": MixedbreadAIEmbeddingEncoder,
+    "octoai": OctoAIEmbeddingEncoder,
+}
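
The new EMBEDDING_PROVIDER_TO_CLASS_MAP resolves a provider key to its encoder class. A minimal usage sketch (not part of the diff; the API key and model name below are placeholders):

from unstructured_ingest.embed import EMBEDDING_PROVIDER_TO_CLASS_MAP
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig

# Look up the encoder class for a provider key and build it with its config.
encoder_cls = EMBEDDING_PROVIDER_TO_CLASS_MAP["langchain-openai"]
encoder = encoder_cls(
    config=OpenAIEmbeddingConfig(api_key="sk-...", model_name="text-embedding-ada-002")  # placeholders
)

# Elements are plain dicts with a "text" key; embeddings are written back onto them.
elements = encoder.embed_documents([{"text": "hello"}, {"text": "world"}])
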
unstructured_ingest/embed/bedrock.py
@@ -0,0 +1,70 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List
+
+import numpy as np
+from pydantic import SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_community.embeddings import BedrockEmbeddings
+
+
+class BedrockEmbeddingConfig(EmbeddingConfig):
+    aws_access_key_id: SecretStr
+    aws_secret_access_key: SecretStr
+    region_name: str = "us-west-2"
+
+    @requires_dependencies(
+        ["boto3", "numpy", "langchain_community"],
+        extras="bedrock",
+    )
+    def get_client(self) -> "BedrockEmbeddings":
+        # delay import only when needed
+        import boto3
+        from langchain_community.embeddings import BedrockEmbeddings
+
+        bedrock_runtime = boto3.client(
+            service_name="bedrock-runtime",
+            aws_access_key_id=self.aws_access_key_id.get_secret_value(),
+            aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
+            region_name=self.region_name,
+        )
+
+        bedrock_client = BedrockEmbeddings(client=bedrock_runtime)
+        return bedrock_client
+
+
+@dataclass
+class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: BedrockEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="Q")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        bedrock_client = self.config.get_client()
+        return np.array(bedrock_client.embed_query(query))
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        bedrock_client = self.config.get_client()
+        embeddings = bedrock_client.embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
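
BedrockEmbeddingEncoder wires boto3's bedrock-runtime client into langchain's BedrockEmbeddings. A hedged usage sketch (not part of the diff; the credentials are placeholders, and the "bedrock" extra plus valid AWS access are assumed):

from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder

# region_name defaults to "us-west-2" when omitted; key values are placeholders.
config = BedrockEmbeddingConfig(
    aws_access_key_id="AKIA...",        # placeholder
    aws_secret_access_key="secret...",  # placeholder
)
encoder = BedrockEmbeddingEncoder(config=config)

vector = encoder.embed_query("What is Amazon Bedrock?")  # numpy array of floats
print(vector.shape)
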
unstructured_ingest/embed/huggingface.py
@@ -0,0 +1,73 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+
+
+class HuggingFaceEmbeddingConfig(EmbeddingConfig):
+    embedder_model_name: Optional[str] = Field(
+        default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
+    )
+    embedder_model_kwargs: Optional[dict] = Field(
+        default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
+    )
+    encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
+    cache_folder: Optional[dict] = Field(default=None)
+
+    @requires_dependencies(
+        ["langchain_huggingface"],
+        extras="embed-huggingface",
+    )
+    def get_client(self) -> "HuggingFaceEmbeddings":
+        """Creates a langchain Huggingface python client to embed elements."""
+        from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+
+        client = HuggingFaceEmbeddings(
+            model_name=self.embedder_model_name,
+            model_kwargs=self.embedder_model_kwargs,
+            encode_kwargs=self.encode_kwargs,
+            cache_folder=self.cache_folder,
+        )
+        return client
+
+
+@dataclass
+class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: HuggingFaceEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="Q")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        client = self.config.get_client()
+        return client.embed_query(str(query))
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
unstructured_ingest/embed/interfaces.py
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from pydantic import BaseModel
+
+
+class EmbeddingConfig(BaseModel):
+    pass
+
+
+@dataclass
+class BaseEmbeddingEncoder(ABC):
+    config: EmbeddingConfig
+
+    def initialize(self):
+        """Initializes the embedding encoder class. Should also validate the instance
+        is properly configured: e.g., embed a single a element"""
+
+    @property
+    @abstractmethod
+    def num_of_dimensions(self) -> Tuple[int]:
+        """Number of dimensions for the embedding vector."""
+
+    @property
+    @abstractmethod
+    def is_unit_vector(self) -> bool:
+        """Denotes if the embedding vector is a unit vector."""
+
+    @abstractmethod
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        pass
+
+    @abstractmethod
+    def embed_query(self, query: str) -> List[float]:
+        pass
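
BaseEmbeddingEncoder is the contract every provider module in this release implements: num_of_dimensions and is_unit_vector as properties, plus embed_documents and embed_query. A minimal illustrative subclass (hypothetical, not part of the package) that satisfies the interface:

from dataclasses import dataclass
from typing import List, Tuple

from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig


@dataclass
class ZeroEmbeddingEncoder(BaseEmbeddingEncoder):
    """Toy encoder that returns a fixed three-dimensional zero vector."""

    config: EmbeddingConfig

    @property
    def num_of_dimensions(self) -> Tuple[int]:
        return (3,)

    @property
    def is_unit_vector(self) -> bool:
        return False

    def embed_query(self, query: str) -> List[float]:
        return [0.0, 0.0, 0.0]

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        # Write the embedding back onto each element dict, as the bundled encoders do.
        for element in elements:
            element["embeddings"] = self.embed_query(element.get("text", ""))
        return elements
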
unstructured_ingest/embed/mixedbreadai.py
@@ -0,0 +1,177 @@
+import os
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+USER_AGENT = "@mixedbread-ai/unstructured"
+BATCH_SIZE = 128
+TIMEOUT = 60
+MAX_RETRIES = 3
+ENCODING_FORMAT = "float"
+TRUNCATION_STRATEGY = "end"
+
+
+if TYPE_CHECKING:
+    from mixedbread_ai.client import MixedbreadAI
+    from mixedbread_ai.core import RequestOptions
+
+
+class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
+    """
+    Configuration class for Mixedbread AI Embedding Encoder.
+
+    Attributes:
+        api_key (str): API key for accessing Mixedbread AI..
+        embedder_model_name (str): Name of the model to use for embeddings.
+    """
+
+    api_key: SecretStr = Field(
+        default_factory=lambda: SecretStr(os.environ.get("MXBAI_API_KEY")),
+    )
+
+    embedder_model_name: str = Field(
+        default="mixedbread-ai/mxbai-embed-large-v1", alias="model_name"
+    )
+
+    @requires_dependencies(
+        ["mixedbread_ai"],
+        extras="embed-mixedbreadai",
+    )
+    def get_client(self) -> "MixedbreadAI":
+        """
+        Create the Mixedbread AI client.
+
+        Returns:
+            MixedbreadAI: Initialized client.
+        """
+        from mixedbread_ai.client import MixedbreadAI
+
+        return MixedbreadAI(
+            api_key=self.api_key.get_secret_value(),
+        )
+
+
+@dataclass
+class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    """
+    Embedding encoder for Mixedbread AI.
+
+    Attributes:
+        config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
+    """
+
+    config: MixedbreadAIEmbeddingConfig
+
+    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+    _request_options: Optional["RequestOptions"] = field(init=False, default=None)
+
+    def get_exemplary_embedding(self) -> List[float]:
+        """Get an exemplary embedding to determine dimensions and unit vector status."""
+        return self._embed(["Q"])[0]
+
+    def initialize(self):
+        if self.config.api_key is None:
+            raise ValueError(
+                "The Mixedbread AI API key must be specified."
+                + "You either pass it in the constructor using 'api_key'"
+                + "or via the 'MXBAI_API_KEY' environment variable."
+            )
+
+        from mixedbread_ai.core import RequestOptions
+
+        self._request_options = RequestOptions(
+            max_retries=MAX_RETRIES,
+            timeout_in_seconds=TIMEOUT,
+            additional_headers={"User-Agent": USER_AGENT},
+        )
+
+    @property
+    def num_of_dimensions(self):
+        """Get the number of dimensions for the embeddings."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    @property
+    def is_unit_vector(self) -> bool:
+        """Check if the embedding is a unit vector."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def _embed(self, texts: List[str]) -> List[List[float]]:
+        """
+        Embed a list of texts using the Mixedbread AI API.
+
+        Args:
+            texts (List[str]): List of texts to embed.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        batch_size = BATCH_SIZE
+        batch_itr = range(0, len(texts), batch_size)
+
+        responses = []
+        client = self.config.get_client()
+        for i in batch_itr:
+            batch = texts[i : i + batch_size]
+            response = client.embeddings(
+                model=self.config.embedder_model_name,
+                normalized=True,
+                encoding_format=ENCODING_FORMAT,
+                truncation_strategy=TRUNCATION_STRATEGY,
+                request_options=self._request_options,
+                input=batch,
+            )
+            responses.append(response)
+        return [item.embedding for response in responses for item in response.data]
+
+    @staticmethod
+    def _add_embeddings_to_elements(
+        elements: List[dict], embeddings: List[List[float]]
+    ) -> List[dict]:
+        """
+        Add embeddings to elements.
+
+        Args:
+            elements (List[Element]): List of elements.
+            embeddings (List[List[float]]): List of embeddings.
+
+        Returns:
+            List[Element]: Elements with embeddings added.
+        """
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        """
+        Embed a list of document elements.
+
+        Args:
+            elements (List[Element]): List of document elements.
+
+        Returns:
+            List[Element]: Elements with embeddings.
+        """
+        embeddings = self._embed([e.get("text", "") for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+
+    def embed_query(self, query: str) -> List[float]:
+        """
+        Embed a query string.
+
+        Args:
+            query (str): Query string to embed.
+
+        Returns:
+            List[float]: Embedding of the query.
+        """
+        return self._embed([query])[0]
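
The _embed helper above sends requests in fixed batches of BATCH_SIZE texts. As a standalone illustration of the slicing (not part of the diff), 300 texts produce three batches of 128, 128, and 44:

texts = [f"chunk {i}" for i in range(300)]
BATCH_SIZE = 128
# Same slicing pattern as _embed: step through the list BATCH_SIZE items at a time.
batches = [texts[i : i + BATCH_SIZE] for i in range(0, len(texts), BATCH_SIZE)]
print([len(b) for b in batches])  # [128, 128, 44]
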
unstructured_ingest/embed/octoai.py
@@ -0,0 +1,63 @@
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from openai import OpenAI
+
+
+class OctoAiEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(default="thenlper/gte-large", alias="model_name")
+    base_url: str = Field(default="https://text.octoai.run/v1")
+
+    @requires_dependencies(
+        ["openai", "tiktoken"],
+        extras="embed-octoai",
+    )
+    def get_client(self) -> "OpenAI":
+        """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
+        from openai import OpenAI
+
+        return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
+
+
+@dataclass
+class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: OctoAiEmbeddingConfig
+    # Uses the OpenAI SDK
+    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query("Q")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query: str):
+        client = self.config.get_client()
+        response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+        return response.data[0].embedding
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        embeddings = [self.embed_query(e.get("text", "")) for e in elements]
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
unstructured_ingest/embed/openai.py
@@ -0,0 +1,61 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_openai.embeddings import OpenAIEmbeddings
+
+
+class OpenAIEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
+
+    @requires_dependencies(["langchain_openai"], extras="openai")
+    def get_client(self) -> "OpenAIEmbeddings":
+        """Creates a langchain OpenAI python client to embed elements."""
+        from langchain_openai import OpenAIEmbeddings
+
+        openai_client = OpenAIEmbeddings(
+            openai_api_key=self.api_key.get_secret_value(),
+            model=self.embedder_model_name, # type:ignore
+        )
+        return openai_client
+
+
+@dataclass
+class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: OpenAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="Q")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        client = self.config.get_client()
+        return client.embed_query(str(query))
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
unstructured_ingest/embed/vertexai.py
@@ -0,0 +1,88 @@
+# type: ignore
+import json
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Annotated, Any, List, Optional
+
+import numpy as np
+from pydantic import Field, Secret, ValidationError
+from pydantic.functional_validators import BeforeValidator
+from unstructured.utils import FileHandler
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_google_vertexai import VertexAIEmbeddings
+
+
+def conform_string_to_dict(value: Any) -> dict:
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, str):
+        return json.loads(value)
+    raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
+
+
+ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
+
+
+class VertexAIEmbeddingConfig(EmbeddingConfig):
+    api_key: ApiKeyType
+    embedder_model_name: Optional[str] = Field(
+        default="textembedding-gecko@001", alias="model_name"
+    )
+
+    def register_application_credentials(self):
+        # TODO look into passing credentials in directly, rather than via env var and tmp file
+        application_credentials_path = os.path.join("/tmp", "google-vertex-app-credentials.json")
+        credentials_file = FileHandler(application_credentials_path)
+        credentials_file.write_file(json.dumps(self.api_key.get_secret_value()))
+        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = application_credentials_path
+
+    @requires_dependencies(
+        ["langchain", "langchain_google_vertexai"],
+        extras="embed-vertexai",
+    )
+    def get_client(self) -> "VertexAIEmbeddings":
+        """Creates a Langchain VertexAI python client to embed elements."""
+        from langchain_google_vertexai import VertexAIEmbeddings
+
+        self.register_application_credentials()
+        vertexai_client = VertexAIEmbeddings(model_name=self.embedder_model_name)
+        return vertexai_client
+
+
+@dataclass
+class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: VertexAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="A sample query.")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        client = self.config.get_client()
+        result = client.embed_query(str(query))
+        return result
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
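
VertexAIEmbeddingConfig accepts the service-account credentials either as a dict or as a JSON string: conform_string_to_dict parses the string form before it is wrapped in a pydantic Secret, and register_application_credentials later writes it to a temp file referenced by GOOGLE_APPLICATION_CREDENTIALS. A small sketch of the validator behavior (not part of the diff; the credential payload is a placeholder):

from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig

# Both forms should validate to the same secret dict; the JSON string is parsed first.
creds = {"type": "service_account", "project_id": "example-project"}  # placeholder payload
config_from_dict = VertexAIEmbeddingConfig(api_key=creds)
config_from_str = VertexAIEmbeddingConfig(api_key='{"type": "service_account", "project_id": "example-project"}')

assert config_from_dict.api_key.get_secret_value() == config_from_str.api_key.get_secret_value()
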
unstructured_ingest/embed/voyageai.py
@@ -0,0 +1,69 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_voyageai import VoyageAIEmbeddings
+
+
+class VoyageAIEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(alias="model_name")
+    batch_size: Optional[int] = Field(default=None)
+    truncation: Optional[bool] = Field(default=None)
+
+    @requires_dependencies(
+        ["langchain", "langchain_voyageai"],
+        extras="embed-voyageai",
+    )
+    def get_client(self) -> "VoyageAIEmbeddings":
+        """Creates a Langchain VoyageAI python client to embed elements."""
+        from langchain_voyageai import VoyageAIEmbeddings
+
+        return VoyageAIEmbeddings(
+            voyage_api_key=self.api_key,
+            model=self.embedder_model_name,
+            batch_size=self.batch_size,
+            truncation=self.truncation,
+        )
+
+
+@dataclass
+class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: VoyageAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="A sample query.")
+
+    @property
+    def num_of_dimensions(self) -> tuple[int, ...]:
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    @property
+    def is_unit_vector(self) -> bool:
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+
+    def embed_query(self, query: str) -> List[float]:
+        client = self.config.get_client()
+        return client.embed_query(query)
+
+    @staticmethod
+    def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements