unstructured-ingest 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +17 -7
- unstructured_ingest/pipeline/reformat/embedding.py +3 -5
- unstructured_ingest/utils/data_prep.py +21 -8
- unstructured_ingest/v2/cli/base/src.py +2 -1
- unstructured_ingest/v2/pipeline/interfaces.py +3 -1
- unstructured_ingest/v2/pipeline/pipeline.py +25 -23
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -2
- unstructured_ingest/v2/processes/connectors/onedrive.py +6 -4
- unstructured_ingest/v2/processes/connectors/pinecone.py +73 -32
- unstructured_ingest/v2/processes/connectors/sharepoint.py +1 -1
- unstructured_ingest/v2/processes/embedder.py +41 -24
- {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/METADATA +213 -210
- {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/RECORD +28 -19
- {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.12.dist-info → unstructured_ingest-0.0.14.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.12" # pragma: no cover
|
|
1
|
+
__version__ = "0.0.14" # pragma: no cover
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from unstructured_ingest.embed.bedrock import BedrockEmbeddingEncoder
|
|
2
|
+
from unstructured_ingest.embed.huggingface import HuggingFaceEmbeddingEncoder
|
|
3
|
+
from unstructured_ingest.embed.mixedbreadai import MixedbreadAIEmbeddingEncoder
|
|
4
|
+
from unstructured_ingest.embed.octoai import OctoAIEmbeddingEncoder
|
|
5
|
+
from unstructured_ingest.embed.openai import OpenAIEmbeddingEncoder
|
|
6
|
+
from unstructured_ingest.embed.vertexai import VertexAIEmbeddingEncoder
|
|
7
|
+
from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingEncoder
|
|
8
|
+
|
|
9
|
+
# Lookup table from embedding provider identifier to the encoder class
# imported above that implements it.
EMBEDDING_PROVIDER_TO_CLASS_MAP = {
    "langchain-openai": OpenAIEmbeddingEncoder,
    "langchain-huggingface": HuggingFaceEmbeddingEncoder,
    "langchain-aws-bedrock": BedrockEmbeddingEncoder,
    "langchain-vertexai": VertexAIEmbeddingEncoder,
    "langchain-voyageai": VoyageAIEmbeddingEncoder,
    "mixedbread-ai": MixedbreadAIEmbeddingEncoder,
    "octoai": OctoAIEmbeddingEncoder,
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING, List
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from pydantic import SecretStr
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from langchain_community.embeddings import BedrockEmbeddings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BedrockEmbeddingConfig(EmbeddingConfig):
    """Credentials and region used to build a Bedrock embeddings client."""

    aws_access_key_id: SecretStr
    aws_secret_access_key: SecretStr
    region_name: str = "us-west-2"

    @requires_dependencies(
        ["boto3", "numpy", "langchain_community"],
        extras="bedrock",
    )
    def get_client(self) -> "BedrockEmbeddings":
        """Return a langchain ``BedrockEmbeddings`` wrapping a boto3 ``bedrock-runtime`` client."""
        # Optional dependencies are imported lazily, only when a client is requested.
        import boto3
        from langchain_community.embeddings import BedrockEmbeddings

        runtime_kwargs = {
            "service_name": "bedrock-runtime",
            "aws_access_key_id": self.aws_access_key_id.get_secret_value(),
            "aws_secret_access_key": self.aws_secret_access_key.get_secret_value(),
            "region_name": self.region_name,
        }
        return BedrockEmbeddings(client=boto3.client(**runtime_kwargs))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that delegates to the client built by ``BedrockEmbeddingConfig``.

    NOTE(review): ``BaseEmbeddingEncoder`` declares ``num_of_dimensions`` and
    ``is_unit_vector`` as properties, while this class defines them as plain
    methods; left unchanged so existing call sites keep working.
    """

    config: BedrockEmbeddingConfig

    def get_exemplary_embedding(self) -> List[float]:
        """Embed the fixed probe string ``"Q"`` and return the result."""
        return self.embed_query(query="Q")

    def num_of_dimensions(self):
        """Return ``np.shape`` of the exemplary embedding."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.shape(exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the norm of the exemplary embedding is close to 1.0."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

    def embed_query(self, query):
        """Embed ``query`` with a freshly built client and return it as a numpy array."""
        bedrock_client = self.config.get_client()
        return np.array(bedrock_client.embed_query(query))

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        """Embed the ``"text"`` value of every element (``""`` when missing).

        Returns:
            The same ``elements`` list, each dict updated in place with an
            ``"embeddings"`` key.
        """
        bedrock_client = self.config.get_client()
        embeddings = bedrock_client.embed_documents([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
        """Store ``embeddings[i]`` under ``elements[i]["embeddings"]`` and return ``elements``."""
        assert len(elements) == len(embeddings)
        # The original also filled a second list that was never returned; the
        # input dicts are mutated in place, so pairing them up is all that is needed.
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from pydantic import Field
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HuggingFaceEmbeddingConfig(EmbeddingConfig):
    """Settings forwarded to langchain's ``HuggingFaceEmbeddings`` client."""

    # Exposed under the alias "model_name".
    embedder_model_name: Optional[str] = Field(
        default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
    )
    # Exposed under the alias "model_kwargs".
    embedder_model_kwargs: Optional[dict] = Field(
        default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
    )
    encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
    # Passed to HuggingFaceEmbeddings(cache_folder=...), which takes a path string.
    # It was previously annotated as a dict, so a real path could never validate.
    cache_folder: Optional[str] = Field(default=None)

    @requires_dependencies(
        ["langchain_huggingface"],
        extras="embed-huggingface",
    )
    def get_client(self) -> "HuggingFaceEmbeddings":
        """Creates a langchain Huggingface python client to embed elements."""
        from langchain_huggingface.embeddings import HuggingFaceEmbeddings

        client = HuggingFaceEmbeddings(
            model_name=self.embedder_model_name,
            model_kwargs=self.embedder_model_kwargs,
            encode_kwargs=self.encode_kwargs,
            cache_folder=self.cache_folder,
        )
        return client
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that delegates to the client built by ``HuggingFaceEmbeddingConfig``.

    NOTE(review): ``BaseEmbeddingEncoder`` declares ``num_of_dimensions`` and
    ``is_unit_vector`` as properties, while this class defines them as plain
    methods; left unchanged so existing call sites keep working.
    """

    config: HuggingFaceEmbeddingConfig

    def get_exemplary_embedding(self) -> List[float]:
        """Embed the fixed probe string ``"Q"`` and return the result."""
        return self.embed_query(query="Q")

    def num_of_dimensions(self):
        """Return ``np.shape`` of the exemplary embedding."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.shape(exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the norm of the exemplary embedding is close to 1.0."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

    def embed_query(self, query):
        """Embed ``str(query)`` with a freshly built client."""
        client = self.config.get_client()
        return client.embed_query(str(query))

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        """Embed the ``"text"`` value of every element (``""`` when missing).

        Returns:
            The same ``elements`` list, each dict updated in place with an
            ``"embeddings"`` key.
        """
        client = self.config.get_client()
        embeddings = client.embed_documents([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> List[dict]:
        """Store ``embeddings[i]`` under ``elements[i]["embeddings"]`` and return ``elements``."""
        assert len(elements) == len(embeddings)
        # The original also filled a second list that was never returned; the
        # input dicts are mutated in place, so pairing them up is all that is needed.
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import List, Tuple
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EmbeddingConfig(BaseModel):
    """Common pydantic base for embedding configurations; declares no fields itself."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class BaseEmbeddingEncoder(ABC):
    """Abstract interface implemented by every embedding encoder."""

    config: EmbeddingConfig

    def initialize(self):
        """Initializes the embedding encoder class. Should also validate the instance
        is properly configured: e.g., embed a single element"""

    @property
    @abstractmethod
    def num_of_dimensions(self) -> Tuple[int]:
        """Number of dimensions for the embedding vector."""

    @property
    @abstractmethod
    def is_unit_vector(self) -> bool:
        """Denotes if the embedding vector is a unit vector."""

    @abstractmethod
    def embed_documents(self, elements: List[dict]) -> List[dict]:
        """Embed a list of element dicts and return a list of element dicts."""

    @abstractmethod
    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string and return its embedding vector."""
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from pydantic import Field, SecretStr
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
9
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
|
+
|
|
11
|
+
# Module-level settings for the Mixedbread AI encoder.
USER_AGENT = "@mixedbread-ai/unstructured"  # sent as the "User-Agent" request header
BATCH_SIZE = 128  # number of texts sent per embeddings request
TIMEOUT = 60  # passed as timeout_in_seconds in the request options
MAX_RETRIES = 3  # passed as max_retries in the request options
ENCODING_FORMAT = "float"  # passed as encoding_format to the embeddings call
TRUNCATION_STRATEGY = "end"  # passed as truncation_strategy to the embeddings call
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from mixedbread_ai.client import MixedbreadAI
|
|
21
|
+
from mixedbread_ai.core import RequestOptions
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
    """
    Settings for the Mixedbread AI embedding encoder.

    Attributes:
        api_key (SecretStr): API key for accessing Mixedbread AI. When not supplied,
            the default wraps whatever ``MXBAI_API_KEY`` holds in the environment
            (which is ``None`` if the variable is unset).
        embedder_model_name (str): Name of the model to use for embeddings
            (alias ``model_name``).
    """

    api_key: SecretStr = Field(
        default_factory=lambda: SecretStr(os.getenv("MXBAI_API_KEY")),
    )

    embedder_model_name: str = Field(
        default="mixedbread-ai/mxbai-embed-large-v1", alias="model_name"
    )

    @requires_dependencies(
        ["mixedbread_ai"],
        extras="embed-mixedbreadai",
    )
    def get_client(self) -> "MixedbreadAI":
        """
        Build a Mixedbread AI client from the configured API key.

        Returns:
            MixedbreadAI: Initialized client.
        """
        from mixedbread_ai.client import MixedbreadAI

        secret = self.api_key.get_secret_value()
        return MixedbreadAI(api_key=secret)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """
    Embedding encoder for Mixedbread AI.

    Attributes:
        config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
    """

    config: MixedbreadAIEmbeddingConfig

    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
    _request_options: Optional["RequestOptions"] = field(init=False, default=None)

    def get_exemplary_embedding(self) -> List[float]:
        """Get an exemplary embedding to determine dimensions and unit vector status."""
        return self._embed(["Q"])[0]

    def initialize(self):
        """
        Validate that an API key is available and build the request options.

        Raises:
            ValueError: If no API key is configured.
        """
        api_key = self.config.api_key
        # The config's default factory wraps the environment value in SecretStr even
        # when the variable is unset, so the wrapped value must be checked as well;
        # a bare `is None` test on the field never fires in that case.
        if api_key is None or not api_key.get_secret_value():
            raise ValueError(
                "The Mixedbread AI API key must be specified. "
                "You either pass it in the constructor using 'api_key' "
                "or via the 'MXBAI_API_KEY' environment variable."
            )

        from mixedbread_ai.core import RequestOptions

        self._request_options = RequestOptions(
            max_retries=MAX_RETRIES,
            timeout_in_seconds=TIMEOUT,
            additional_headers={"User-Agent": USER_AGENT},
        )

    @property
    def num_of_dimensions(self):
        """Get the number of dimensions for the embeddings."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.shape(exemplary_embedding)

    @property
    def is_unit_vector(self) -> bool:
        """Check if the embedding is a unit vector."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

    def _embed(self, texts: List[str]) -> List[List[float]]:
        """
        Embed a list of texts using the Mixedbread AI API.

        Texts are sent in batches of ``BATCH_SIZE``; the embeddings of all
        batches are returned as one flat list in request order.

        Args:
            texts (List[str]): List of texts to embed.

        Returns:
            List[List[float]]: List of embeddings.
        """
        batch_size = BATCH_SIZE
        batch_itr = range(0, len(texts), batch_size)

        responses = []
        client = self.config.get_client()
        for i in batch_itr:
            batch = texts[i : i + batch_size]
            response = client.embeddings(
                model=self.config.embedder_model_name,
                normalized=True,
                encoding_format=ENCODING_FORMAT,
                truncation_strategy=TRUNCATION_STRATEGY,
                request_options=self._request_options,
                input=batch,
            )
            responses.append(response)
        return [item.embedding for response in responses for item in response.data]

    @staticmethod
    def _add_embeddings_to_elements(
        elements: List[dict], embeddings: List[List[float]]
    ) -> List[dict]:
        """
        Add embeddings to elements.

        Args:
            elements (List[dict]): List of element dicts; updated in place.
            embeddings (List[List[float]]): List of embeddings, one per element.

        Returns:
            List[dict]: The same ``elements`` list with an ``"embeddings"`` key set
            on every dict.
        """
        assert len(elements) == len(embeddings)
        # The original also filled a second list that was never returned.
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        """
        Embed a list of document elements.

        Args:
            elements (List[dict]): Element dicts; the ``"text"`` value is embedded
                (``""`` when missing).

        Returns:
            List[dict]: Elements with embeddings.
        """
        embeddings = self._embed([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def embed_query(self, query: str) -> List[float]:
        """
        Embed a query string.

        Args:
            query (str): Query string to embed.

        Returns:
            List[float]: Embedding of the query.
        """
        return self._embed([query])[0]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from pydantic import Field, SecretStr
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from openai import OpenAI
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class OctoAiEmbeddingConfig(EmbeddingConfig):
    """Settings for reaching the OctoAI embeddings endpoint through the OpenAI SDK."""

    api_key: SecretStr
    embedder_model_name: str = Field(default="thenlper/gte-large", alias="model_name")
    base_url: str = Field(default="https://text.octoai.run/v1")

    @requires_dependencies(
        ["openai", "tiktoken"],
        extras="embed-octoai",
    )
    def get_client(self) -> "OpenAI":
        """Build an OpenAI SDK client pointed at ``base_url`` with the configured key."""
        from openai import OpenAI

        secret = self.api_key.get_secret_value()
        client = OpenAI(api_key=secret, base_url=self.base_url)
        return client
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder for OctoAI; requests go through the OpenAI SDK client.

    NOTE(review): ``BaseEmbeddingEncoder`` declares ``num_of_dimensions`` and
    ``is_unit_vector`` as properties, while this class defines them as plain
    methods; left unchanged so existing call sites keep working.
    """

    config: OctoAiEmbeddingConfig
    # Uses the OpenAI SDK
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    def get_exemplary_embedding(self) -> List[float]:
        """Embed the fixed probe string ``"Q"`` and return the result."""
        return self.embed_query("Q")

    def num_of_dimensions(self):
        """Return ``np.shape`` of the exemplary embedding."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.shape(exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the norm of the exemplary embedding is close to 1.0."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

    def _embed_text(self, client, text: str):
        """Request one embedding for ``text`` from ``client`` using the configured model."""
        response = client.embeddings.create(input=text, model=self.config.embedder_model_name)
        return response.data[0].embedding

    def embed_query(self, query: str):
        """Embed ``query`` with a freshly built client."""
        return self._embed_text(self.config.get_client(), query)

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        """Embed the ``"text"`` value of every element (``""`` when missing).

        One request is still issued per element, but the client is built once
        instead of once per element.

        Returns:
            The same ``elements`` list, each dict updated in place with an
            ``"embeddings"`` key.
        """
        client = self.config.get_client()
        embeddings = [self._embed_text(client, e.get("text", "")) for e in elements]
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
        """Store ``embeddings[i]`` under ``elements[i]["embeddings"]`` and return ``elements``."""
        assert len(elements) == len(embeddings)
        # The original also filled a second list that was never returned.
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING, List
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from pydantic import Field, SecretStr
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from langchain_openai.embeddings import OpenAIEmbeddings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class OpenAIEmbeddingConfig(EmbeddingConfig):
    """API key and model name used to build a langchain OpenAI embeddings client."""

    api_key: SecretStr
    embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")

    @requires_dependencies(["langchain_openai"], extras="openai")
    def get_client(self) -> "OpenAIEmbeddings":
        """Build a langchain ``OpenAIEmbeddings`` client from this configuration."""
        from langchain_openai import OpenAIEmbeddings

        secret = self.api_key.get_secret_value()
        return OpenAIEmbeddings(
            openai_api_key=secret,
            model=self.embedder_model_name,  # type:ignore
        )
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that delegates to the client built by ``OpenAIEmbeddingConfig``.

    NOTE(review): ``BaseEmbeddingEncoder`` declares ``num_of_dimensions`` and
    ``is_unit_vector`` as properties, while this class defines them as plain
    methods; left unchanged so existing call sites keep working.
    """

    config: OpenAIEmbeddingConfig

    def get_exemplary_embedding(self) -> List[float]:
        """Embed the fixed probe string ``"Q"`` and return the result."""
        return self.embed_query(query="Q")

    def num_of_dimensions(self):
        """Return ``np.shape`` of the exemplary embedding."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.shape(exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the norm of the exemplary embedding is close to 1.0."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

    def embed_query(self, query):
        """Embed ``str(query)`` with a freshly built client."""
        client = self.config.get_client()
        return client.embed_query(str(query))

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        """Embed the ``"text"`` value of every element (``""`` when missing).

        Returns:
            The same ``elements`` list, each dict updated in place with an
            ``"embeddings"`` key.
        """
        client = self.config.get_client()
        embeddings = client.embed_documents([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
        """Store ``embeddings[i]`` under ``elements[i]["embeddings"]`` and return ``elements``."""
        assert len(elements) == len(embeddings)
        # The original also filled a second list that was never returned.
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# type: ignore
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import TYPE_CHECKING, Annotated, Any, List, Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from pydantic import Field, Secret, ValidationError
|
|
9
|
+
from pydantic.functional_validators import BeforeValidator
|
|
10
|
+
from unstructured.utils import FileHandler
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from langchain_google_vertexai import VertexAIEmbeddings
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def conform_string_to_dict(value: Any) -> dict:
    """Coerce ``value`` to a dict for use as a pydantic ``BeforeValidator``.

    Args:
        value: A dict (returned unchanged) or a JSON string (parsed with
            ``json.loads``).

    Returns:
        The dict itself, or the result of parsing the JSON string.

    Raises:
        ValueError: If ``value`` is neither a dict nor a string, or if the
            string is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        return json.loads(value)
    # pydantic's ValidationError cannot be constructed from a plain message;
    # validators are expected to raise ValueError, which pydantic then reports
    # as a validation failure.
    raise ValueError(f"Input could not be mapped to a valid dict: {value}")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Secret-wrapped dict; input is first passed through conform_string_to_dict,
# so a JSON string is accepted as well as a dict.
ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class VertexAIEmbeddingConfig(EmbeddingConfig):
    """Credentials and model name used to build a langchain VertexAI embeddings client."""

    api_key: ApiKeyType
    embedder_model_name: Optional[str] = Field(
        default="textembedding-gecko@001", alias="model_name"
    )

    def register_application_credentials(self):
        """Write the credentials to a temp file and point Google's env var at it.

        The secret dict is serialized as JSON to
        ``/tmp/google-vertex-app-credentials.json`` and
        ``GOOGLE_APPLICATION_CREDENTIALS`` is set to that path.
        """
        # TODO look into passing credentials in directly, rather than via env var and tmp file
        application_credentials_path = os.path.join("/tmp", "google-vertex-app-credentials.json")
        # The file holds a service-account secret in a shared directory: make sure it
        # exists with owner-only permissions before anything sensitive is written.
        # O_CREAT's mode only applies to a new file, so chmod covers a pre-existing one.
        fd = os.open(application_credentials_path, os.O_WRONLY | os.O_CREAT, 0o600)
        os.close(fd)
        os.chmod(application_credentials_path, 0o600)
        credentials_file = FileHandler(application_credentials_path)
        credentials_file.write_file(json.dumps(self.api_key.get_secret_value()))
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = application_credentials_path

    @requires_dependencies(
        ["langchain", "langchain_google_vertexai"],
        extras="embed-vertexai",
    )
    def get_client(self) -> "VertexAIEmbeddings":
        """Creates a Langchain VertexAI python client to embed elements."""
        from langchain_google_vertexai import VertexAIEmbeddings

        # Credentials are registered first; the client is constructed without them.
        self.register_application_credentials()
        vertexai_client = VertexAIEmbeddings(model_name=self.embedder_model_name)
        return vertexai_client
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that delegates to the client built by ``VertexAIEmbeddingConfig``.

    NOTE(review): ``BaseEmbeddingEncoder`` declares ``num_of_dimensions`` and
    ``is_unit_vector`` as properties, while this class defines them as plain
    methods; left unchanged so existing call sites keep working.
    """

    config: VertexAIEmbeddingConfig

    def get_exemplary_embedding(self) -> List[float]:
        """Embed the fixed probe string ``"A sample query."`` and return the result."""
        return self.embed_query(query="A sample query.")

    def num_of_dimensions(self):
        """Return ``np.shape`` of the exemplary embedding."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.shape(exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the norm of the exemplary embedding is close to 1.0."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

    def embed_query(self, query):
        """Embed ``str(query)`` with a freshly built client."""
        client = self.config.get_client()
        result = client.embed_query(str(query))
        return result

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        """Embed the ``"text"`` value of every element (``""`` when missing).

        Returns:
            The same ``elements`` list, each dict updated in place with an
            ``"embeddings"`` key.
        """
        client = self.config.get_client()
        embeddings = client.embed_documents([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
        """Store ``embeddings[i]`` under ``elements[i]["embeddings"]`` and return ``elements``."""
        assert len(elements) == len(embeddings)
        # The original also filled a second list that was never returned.
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from pydantic import Field, SecretStr
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from langchain_voyageai import VoyageAIEmbeddings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class VoyageAIEmbeddingConfig(EmbeddingConfig):
    """Settings forwarded to langchain's ``VoyageAIEmbeddings`` client."""

    api_key: SecretStr
    # Required; exposed under the alias "model_name".
    embedder_model_name: str = Field(alias="model_name")
    batch_size: Optional[int] = Field(default=None)
    truncation: Optional[bool] = Field(default=None)

    @requires_dependencies(
        ["langchain", "langchain_voyageai"],
        extras="embed-voyageai",
    )
    def get_client(self) -> "VoyageAIEmbeddings":
        """Build a langchain ``VoyageAIEmbeddings`` client from this configuration."""
        from langchain_voyageai import VoyageAIEmbeddings

        client_kwargs = {
            "voyage_api_key": self.api_key,
            "model": self.embedder_model_name,
            "batch_size": self.batch_size,
            "truncation": self.truncation,
        }
        return VoyageAIEmbeddings(**client_kwargs)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that delegates to the client built by ``VoyageAIEmbeddingConfig``."""

    config: VoyageAIEmbeddingConfig

    def get_exemplary_embedding(self) -> List[float]:
        """Embed the fixed probe string ``"A sample query."`` and return the result."""
        return self.embed_query(query="A sample query.")

    @property
    def num_of_dimensions(self) -> tuple[int, ...]:
        """Return ``np.shape`` of the exemplary embedding."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.shape(exemplary_embedding)

    @property
    def is_unit_vector(self) -> bool:
        """Return whether the norm of the exemplary embedding is close to 1.0."""
        exemplary_embedding = self.get_exemplary_embedding()
        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        """Embed the ``"text"`` value of every element (``""`` when missing).

        Returns:
            The same ``elements`` list, each dict updated in place with an
            ``"embeddings"`` key.
        """
        client = self.config.get_client()
        embeddings = client.embed_documents([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def embed_query(self, query: str) -> List[float]:
        """Embed ``query`` with a freshly built client."""
        client = self.config.get_client()
        return client.embed_query(query)

    @staticmethod
    def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
        """Store ``embeddings[i]`` under ``elements[i]["embeddings"]`` and return ``elements``."""
        assert len(elements) == len(embeddings)
        # The original also filled a second list that was never returned.
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements
|