unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_sharepoint.py +161 -10
- test/unit/v2/embedders/test_bedrock.py +1 -1
- test/unit/v2/embedders/test_huggingface.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/azure_openai.py +6 -0
- unstructured_ingest/embed/bedrock.py +16 -6
- unstructured_ingest/embed/huggingface.py +3 -1
- unstructured_ingest/embed/interfaces.py +61 -23
- unstructured_ingest/embed/mixedbreadai.py +28 -114
- unstructured_ingest/embed/octoai.py +19 -51
- unstructured_ingest/embed/openai.py +17 -55
- unstructured_ingest/embed/togetherai.py +16 -58
- unstructured_ingest/embed/vertexai.py +15 -46
- unstructured_ingest/embed/voyageai.py +17 -52
- unstructured_ingest/v2/errors.py +7 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +129 -43
- unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -4
- unstructured_ingest/v2/processes/embedder.py +9 -7
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/METADATA +101 -89
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/RECORD +24 -24
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/WHEEL +1 -1
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/top_level.txt +0 -0
|
@@ -4,13 +4,11 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
from pydantic import Field, SecretStr
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.embed.interfaces import (
|
|
7
|
-
EMBEDDINGS_KEY,
|
|
8
7
|
AsyncBaseEmbeddingEncoder,
|
|
9
8
|
BaseEmbeddingEncoder,
|
|
10
9
|
EmbeddingConfig,
|
|
11
10
|
)
|
|
12
11
|
from unstructured_ingest.logger import logger
|
|
13
|
-
from unstructured_ingest.utils.data_prep import batch_generator
|
|
14
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
13
|
from unstructured_ingest.v2.errors import (
|
|
16
14
|
ProviderError,
|
|
@@ -18,6 +16,7 @@ from unstructured_ingest.v2.errors import (
|
|
|
18
16
|
RateLimitError,
|
|
19
17
|
UserAuthError,
|
|
20
18
|
UserError,
|
|
19
|
+
is_internal_error,
|
|
21
20
|
)
|
|
22
21
|
|
|
23
22
|
if TYPE_CHECKING:
|
|
@@ -30,6 +29,8 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
|
|
|
30
29
|
base_url: str = Field(default="https://text.octoai.run/v1")
|
|
31
30
|
|
|
32
31
|
def wrap_error(self, e: Exception) -> Exception:
|
|
32
|
+
if is_internal_error(e=e):
|
|
33
|
+
return e
|
|
33
34
|
# https://platform.openai.com/docs/guides/error-codes/api-errors
|
|
34
35
|
from openai import APIStatusError
|
|
35
36
|
|
|
@@ -81,31 +82,17 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
81
82
|
def wrap_error(self, e: Exception) -> Exception:
|
|
82
83
|
return self.config.wrap_error(e=e)
|
|
83
84
|
|
|
84
|
-
def
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
|
|
88
|
-
except Exception as e:
|
|
89
|
-
raise self.wrap_error(e=e)
|
|
85
|
+
def _embed_query(self, query: str):
|
|
86
|
+
client = self.get_client()
|
|
87
|
+
response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
|
|
90
88
|
return response.data[0].embedding
|
|
91
89
|
|
|
92
|
-
def
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
embeddings =
|
|
97
|
-
|
|
98
|
-
try:
|
|
99
|
-
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
100
|
-
response = client.embeddings.create(
|
|
101
|
-
input=batch, model=self.config.embedder_model_name
|
|
102
|
-
)
|
|
103
|
-
embeddings.extend([data.embedding for data in response.data])
|
|
104
|
-
except Exception as e:
|
|
105
|
-
raise self.wrap_error(e=e)
|
|
106
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
107
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
108
|
-
return elements
|
|
90
|
+
def get_client(self) -> "OpenAI":
|
|
91
|
+
return self.config.get_client()
|
|
92
|
+
|
|
93
|
+
def embed_batch(self, client: "OpenAI", batch: list[str]) -> list[list[float]]:
|
|
94
|
+
response = client.embeddings.create(input=batch, model=self.config.embedder_model_name)
|
|
95
|
+
return [data.embedding for data in response.data]
|
|
109
96
|
|
|
110
97
|
|
|
111
98
|
@dataclass
|
|
@@ -115,30 +102,11 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
115
102
|
def wrap_error(self, e: Exception) -> Exception:
|
|
116
103
|
return self.config.wrap_error(e=e)
|
|
117
104
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
try:
|
|
121
|
-
response = await client.embeddings.create(
|
|
122
|
-
input=query, model=self.config.embedder_model_name
|
|
123
|
-
)
|
|
124
|
-
except Exception as e:
|
|
125
|
-
raise self.wrap_error(e=e)
|
|
126
|
-
return response.data[0].embedding
|
|
105
|
+
def get_client(self) -> "AsyncOpenAI":
|
|
106
|
+
return self.config.get_async_client()
|
|
127
107
|
|
|
128
|
-
async def
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
embeddings = []
|
|
134
|
-
try:
|
|
135
|
-
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
136
|
-
response = await client.embeddings.create(
|
|
137
|
-
input=batch, model=self.config.embedder_model_name
|
|
138
|
-
)
|
|
139
|
-
embeddings.extend([data.embedding for data in response.data])
|
|
140
|
-
except Exception as e:
|
|
141
|
-
raise self.wrap_error(e=e)
|
|
142
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
143
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
144
|
-
return elements
|
|
108
|
+
async def embed_batch(self, client: "AsyncOpenAI", batch: list[str]) -> list[list[float]]:
|
|
109
|
+
response = await client.embeddings.create(
|
|
110
|
+
input=batch, model=self.config.embedder_model_name
|
|
111
|
+
)
|
|
112
|
+
return [data.embedding for data in response.data]
|
|
@@ -4,13 +4,11 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
from pydantic import Field, SecretStr
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.embed.interfaces import (
|
|
7
|
-
EMBEDDINGS_KEY,
|
|
8
7
|
AsyncBaseEmbeddingEncoder,
|
|
9
8
|
BaseEmbeddingEncoder,
|
|
10
9
|
EmbeddingConfig,
|
|
11
10
|
)
|
|
12
11
|
from unstructured_ingest.logger import logger
|
|
13
|
-
from unstructured_ingest.utils.data_prep import batch_generator
|
|
14
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
13
|
from unstructured_ingest.v2.errors import (
|
|
16
14
|
ProviderError,
|
|
@@ -18,6 +16,7 @@ from unstructured_ingest.v2.errors import (
|
|
|
18
16
|
RateLimitError,
|
|
19
17
|
UserAuthError,
|
|
20
18
|
UserError,
|
|
19
|
+
is_internal_error,
|
|
21
20
|
)
|
|
22
21
|
|
|
23
22
|
if TYPE_CHECKING:
|
|
@@ -29,6 +28,8 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
|
|
|
29
28
|
embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
|
|
30
29
|
|
|
31
30
|
def wrap_error(self, e: Exception) -> Exception:
|
|
31
|
+
if is_internal_error(e=e):
|
|
32
|
+
return e
|
|
32
33
|
# https://platform.openai.com/docs/guides/error-codes/api-errors
|
|
33
34
|
from openai import APIStatusError
|
|
34
35
|
|
|
@@ -72,32 +73,12 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
72
73
|
def wrap_error(self, e: Exception) -> Exception:
|
|
73
74
|
return self.config.wrap_error(e=e)
|
|
74
75
|
|
|
75
|
-
def
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
raise self.wrap_error(e=e)
|
|
82
|
-
return response.data[0].embedding
|
|
83
|
-
|
|
84
|
-
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
85
|
-
client = self.config.get_client()
|
|
86
|
-
elements = elements.copy()
|
|
87
|
-
elements_with_text = [e for e in elements if e.get("text")]
|
|
88
|
-
texts = [e["text"] for e in elements_with_text]
|
|
89
|
-
embeddings = []
|
|
90
|
-
try:
|
|
91
|
-
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
92
|
-
response = client.embeddings.create(
|
|
93
|
-
input=batch, model=self.config.embedder_model_name
|
|
94
|
-
)
|
|
95
|
-
embeddings.extend([data.embedding for data in response.data])
|
|
96
|
-
except Exception as e:
|
|
97
|
-
raise self.wrap_error(e=e)
|
|
98
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
99
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
100
|
-
return elements
|
|
76
|
+
def get_client(self) -> "OpenAI":
|
|
77
|
+
return self.config.get_client()
|
|
78
|
+
|
|
79
|
+
def embed_batch(self, client: "OpenAI", batch: list[str]) -> list[list[float]]:
|
|
80
|
+
response = client.embeddings.create(input=batch, model=self.config.embedder_model_name)
|
|
81
|
+
return [data.embedding for data in response.data]
|
|
101
82
|
|
|
102
83
|
|
|
103
84
|
@dataclass
|
|
@@ -107,30 +88,11 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
107
88
|
def wrap_error(self, e: Exception) -> Exception:
|
|
108
89
|
return self.config.wrap_error(e=e)
|
|
109
90
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
return response.data[0].embedding
|
|
119
|
-
|
|
120
|
-
async def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
121
|
-
client = self.config.get_async_client()
|
|
122
|
-
elements = elements.copy()
|
|
123
|
-
elements_with_text = [e for e in elements if e.get("text")]
|
|
124
|
-
texts = [e["text"] for e in elements_with_text]
|
|
125
|
-
embeddings = []
|
|
126
|
-
try:
|
|
127
|
-
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
128
|
-
response = await client.embeddings.create(
|
|
129
|
-
input=batch, model=self.config.embedder_model_name
|
|
130
|
-
)
|
|
131
|
-
embeddings.extend([data.embedding for data in response.data])
|
|
132
|
-
except Exception as e:
|
|
133
|
-
raise self.wrap_error(e=e)
|
|
134
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
135
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
136
|
-
return elements
|
|
91
|
+
def get_client(self) -> "AsyncOpenAI":
|
|
92
|
+
return self.config.get_async_client()
|
|
93
|
+
|
|
94
|
+
async def embed_batch(self, client: "AsyncOpenAI", batch: list[str]) -> list[list[float]]:
|
|
95
|
+
response = await client.embeddings.create(
|
|
96
|
+
input=batch, model=self.config.embedder_model_name
|
|
97
|
+
)
|
|
98
|
+
return [data.embedding for data in response.data]
|
|
@@ -1,24 +1,19 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
|
-
from typing import TYPE_CHECKING
|
|
2
|
+
from typing import TYPE_CHECKING, Any
|
|
3
3
|
|
|
4
4
|
from pydantic import Field, SecretStr
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.embed.interfaces import (
|
|
7
|
-
EMBEDDINGS_KEY,
|
|
8
7
|
AsyncBaseEmbeddingEncoder,
|
|
9
8
|
BaseEmbeddingEncoder,
|
|
10
9
|
EmbeddingConfig,
|
|
11
10
|
)
|
|
12
11
|
from unstructured_ingest.logger import logger
|
|
13
|
-
from unstructured_ingest.utils.data_prep import batch_generator
|
|
14
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
13
|
from unstructured_ingest.v2.errors import (
|
|
16
14
|
RateLimitError as CustomRateLimitError,
|
|
17
15
|
)
|
|
18
|
-
from unstructured_ingest.v2.errors import
|
|
19
|
-
UserAuthError,
|
|
20
|
-
UserError,
|
|
21
|
-
)
|
|
16
|
+
from unstructured_ingest.v2.errors import UserAuthError, UserError, is_internal_error
|
|
22
17
|
|
|
23
18
|
if TYPE_CHECKING:
|
|
24
19
|
from together import AsyncTogether, Together
|
|
@@ -31,6 +26,8 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
|
|
|
31
26
|
)
|
|
32
27
|
|
|
33
28
|
def wrap_error(self, e: Exception) -> Exception:
|
|
29
|
+
if is_internal_error(e=e):
|
|
30
|
+
return e
|
|
34
31
|
# https://docs.together.ai/docs/error-codes
|
|
35
32
|
from together.error import AuthenticationError, RateLimitError, TogetherException
|
|
36
33
|
|
|
@@ -64,31 +61,12 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
64
61
|
def wrap_error(self, e: Exception) -> Exception:
|
|
65
62
|
return self.config.wrap_error(e=e)
|
|
66
63
|
|
|
67
|
-
def
|
|
68
|
-
return self.
|
|
69
|
-
|
|
70
|
-
def
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
embeddings = self._embed_documents([e["text"] for e in elements_with_text])
|
|
74
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
75
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
76
|
-
return elements
|
|
77
|
-
|
|
78
|
-
def _embed_documents(self, elements: list[str]) -> list[list[float]]:
|
|
79
|
-
client = self.config.get_client()
|
|
80
|
-
embeddings = []
|
|
81
|
-
try:
|
|
82
|
-
for batch in batch_generator(
|
|
83
|
-
elements, batch_size=self.config.batch_size or len(elements)
|
|
84
|
-
):
|
|
85
|
-
outputs = client.embeddings.create(
|
|
86
|
-
model=self.config.embedder_model_name, input=batch
|
|
87
|
-
)
|
|
88
|
-
embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
|
|
89
|
-
except Exception as e:
|
|
90
|
-
raise self.wrap_error(e=e)
|
|
91
|
-
return embeddings
|
|
64
|
+
def get_client(self) -> "Together":
|
|
65
|
+
return self.config.get_client()
|
|
66
|
+
|
|
67
|
+
def embed_batch(self, client: "Together", batch: list[str]) -> list[list[float]]:
|
|
68
|
+
outputs = client.embeddings.create(model=self.config.embedder_model_name, input=batch)
|
|
69
|
+
return [outputs.data[i].embedding for i in range(len(batch))]
|
|
92
70
|
|
|
93
71
|
|
|
94
72
|
@dataclass
|
|
@@ -98,29 +76,9 @@ class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
98
76
|
def wrap_error(self, e: Exception) -> Exception:
|
|
99
77
|
return self.config.wrap_error(e=e)
|
|
100
78
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
elements_with_text = [e for e in elements if e.get("text")]
|
|
108
|
-
embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
|
|
109
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
110
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
111
|
-
return elements
|
|
112
|
-
|
|
113
|
-
async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
|
|
114
|
-
client = self.config.get_async_client()
|
|
115
|
-
embeddings = []
|
|
116
|
-
try:
|
|
117
|
-
for batch in batch_generator(
|
|
118
|
-
elements, batch_size=self.config.batch_size or len(elements)
|
|
119
|
-
):
|
|
120
|
-
outputs = await client.embeddings.create(
|
|
121
|
-
model=self.config.embedder_model_name, input=batch
|
|
122
|
-
)
|
|
123
|
-
embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
|
|
124
|
-
except Exception as e:
|
|
125
|
-
raise self.wrap_error(e=e)
|
|
126
|
-
return embeddings
|
|
79
|
+
def get_client(self) -> "AsyncTogether":
|
|
80
|
+
return self.config.get_async_client()
|
|
81
|
+
|
|
82
|
+
async def embed_batch(self, client: Any, batch: list[str]) -> list[list[float]]:
|
|
83
|
+
outputs = await client.embeddings.create(model=self.config.embedder_model_name, input=batch)
|
|
84
|
+
return [outputs.data[i].embedding for i in range(len(batch))]
|
|
@@ -9,14 +9,12 @@ from pydantic import Field, Secret, ValidationError
|
|
|
9
9
|
from pydantic.functional_validators import BeforeValidator
|
|
10
10
|
|
|
11
11
|
from unstructured_ingest.embed.interfaces import (
|
|
12
|
-
EMBEDDINGS_KEY,
|
|
13
12
|
AsyncBaseEmbeddingEncoder,
|
|
14
13
|
BaseEmbeddingEncoder,
|
|
15
14
|
EmbeddingConfig,
|
|
16
15
|
)
|
|
17
|
-
from unstructured_ingest.utils.data_prep import batch_generator
|
|
18
16
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
19
|
-
from unstructured_ingest.v2.errors import UserAuthError
|
|
17
|
+
from unstructured_ingest.v2.errors import UserAuthError, is_internal_error
|
|
20
18
|
|
|
21
19
|
if TYPE_CHECKING:
|
|
22
20
|
from vertexai.language_models import TextEmbeddingModel
|
|
@@ -40,6 +38,8 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
|
|
|
40
38
|
)
|
|
41
39
|
|
|
42
40
|
def wrap_error(self, e: Exception) -> Exception:
|
|
41
|
+
if is_internal_error(e=e):
|
|
42
|
+
return e
|
|
43
43
|
from google.auth.exceptions import GoogleAuthError
|
|
44
44
|
|
|
45
45
|
if isinstance(e, GoogleAuthError):
|
|
@@ -72,34 +72,19 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
72
72
|
def wrap_error(self, e: Exception) -> Exception:
|
|
73
73
|
return self.config.wrap_error(e=e)
|
|
74
74
|
|
|
75
|
-
def
|
|
76
|
-
return self.
|
|
77
|
-
|
|
78
|
-
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
79
|
-
elements = elements.copy()
|
|
80
|
-
elements_with_text = [e for e in elements if e.get("text")]
|
|
81
|
-
embeddings = self._embed_documents([e["text"] for e in elements_with_text])
|
|
82
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
83
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
84
|
-
return elements
|
|
75
|
+
def get_client(self) -> "TextEmbeddingModel":
|
|
76
|
+
return self.config.get_client()
|
|
85
77
|
|
|
86
78
|
@requires_dependencies(
|
|
87
79
|
["vertexai"],
|
|
88
80
|
extras="embed-vertexai",
|
|
89
81
|
)
|
|
90
|
-
def
|
|
82
|
+
def embed_batch(self, client: "TextEmbeddingModel", batch: list[str]) -> list[list[float]]:
|
|
91
83
|
from vertexai.language_models import TextEmbeddingInput
|
|
92
84
|
|
|
93
|
-
inputs = [TextEmbeddingInput(text=
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
try:
|
|
97
|
-
for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
|
|
98
|
-
response = client.get_embeddings(batch)
|
|
99
|
-
embeddings.extend([e.values for e in response])
|
|
100
|
-
except Exception as e:
|
|
101
|
-
raise self.wrap_error(e=e)
|
|
102
|
-
return embeddings
|
|
85
|
+
inputs = [TextEmbeddingInput(text=text) for text in batch]
|
|
86
|
+
response = client.get_embeddings(inputs)
|
|
87
|
+
return [e.values for e in response]
|
|
103
88
|
|
|
104
89
|
|
|
105
90
|
@dataclass
|
|
@@ -109,32 +94,16 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
109
94
|
def wrap_error(self, e: Exception) -> Exception:
|
|
110
95
|
return self.config.wrap_error(e=e)
|
|
111
96
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
return embedding[0]
|
|
115
|
-
|
|
116
|
-
async def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
117
|
-
elements = elements.copy()
|
|
118
|
-
elements_with_text = [e for e in elements if e.get("text")]
|
|
119
|
-
embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
|
|
120
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
121
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
122
|
-
return elements
|
|
97
|
+
def get_client(self) -> "TextEmbeddingModel":
|
|
98
|
+
return self.config.get_client()
|
|
123
99
|
|
|
124
100
|
@requires_dependencies(
|
|
125
101
|
["vertexai"],
|
|
126
102
|
extras="embed-vertexai",
|
|
127
103
|
)
|
|
128
|
-
async def
|
|
104
|
+
async def embed_batch(self, client: Any, batch: list[str]) -> list[list[float]]:
|
|
129
105
|
from vertexai.language_models import TextEmbeddingInput
|
|
130
106
|
|
|
131
|
-
inputs = [TextEmbeddingInput(text=
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
try:
|
|
135
|
-
for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
|
|
136
|
-
response = await client.get_embeddings_async(batch)
|
|
137
|
-
embeddings.extend([e.values for e in response])
|
|
138
|
-
except Exception as e:
|
|
139
|
-
raise self.wrap_error(e=e)
|
|
140
|
-
return embeddings
|
|
107
|
+
inputs = [TextEmbeddingInput(text=text) for text in batch]
|
|
108
|
+
response = await client.get_embeddings_async(inputs)
|
|
109
|
+
return [e.values for e in response]
|
|
@@ -4,19 +4,13 @@ from typing import TYPE_CHECKING, Optional
|
|
|
4
4
|
from pydantic import Field, SecretStr
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.embed.interfaces import (
|
|
7
|
-
EMBEDDINGS_KEY,
|
|
8
7
|
AsyncBaseEmbeddingEncoder,
|
|
9
8
|
BaseEmbeddingEncoder,
|
|
10
9
|
EmbeddingConfig,
|
|
11
10
|
)
|
|
12
11
|
from unstructured_ingest.logger import logger
|
|
13
|
-
from unstructured_ingest.utils.data_prep import batch_generator
|
|
14
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
|
-
from unstructured_ingest.v2.errors import
|
|
16
|
-
ProviderError,
|
|
17
|
-
UserAuthError,
|
|
18
|
-
UserError,
|
|
19
|
-
)
|
|
13
|
+
from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError, is_internal_error
|
|
20
14
|
from unstructured_ingest.v2.errors import (
|
|
21
15
|
RateLimitError as CustomRateLimitError,
|
|
22
16
|
)
|
|
@@ -39,6 +33,8 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
|
|
|
39
33
|
timeout_in_seconds: Optional[int] = None
|
|
40
34
|
|
|
41
35
|
def wrap_error(self, e: Exception) -> Exception:
|
|
36
|
+
if is_internal_error(e=e):
|
|
37
|
+
return e
|
|
42
38
|
# https://docs.voyageai.com/docs/error-codes
|
|
43
39
|
from voyageai.error import AuthenticationError, RateLimitError, VoyageError
|
|
44
40
|
|
|
@@ -96,27 +92,12 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
96
92
|
def wrap_error(self, e: Exception) -> Exception:
|
|
97
93
|
return self.config.wrap_error(e=e)
|
|
98
94
|
|
|
99
|
-
def
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
embeddings.extend(response.embeddings)
|
|
106
|
-
except Exception as e:
|
|
107
|
-
raise self.wrap_error(e=e)
|
|
108
|
-
return embeddings
|
|
109
|
-
|
|
110
|
-
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
111
|
-
elements = elements.copy()
|
|
112
|
-
elements_with_text = [e for e in elements if e.get("text")]
|
|
113
|
-
embeddings = self._embed_documents([e["text"] for e in elements_with_text])
|
|
114
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
115
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
116
|
-
return elements
|
|
117
|
-
|
|
118
|
-
def embed_query(self, query: str) -> list[float]:
|
|
119
|
-
return self._embed_documents(elements=[query])[0]
|
|
95
|
+
def get_client(self) -> "VoyageAIClient":
|
|
96
|
+
return self.config.get_client()
|
|
97
|
+
|
|
98
|
+
def embed_batch(self, client: "VoyageAIClient", batch: list[str]) -> list[list[float]]:
|
|
99
|
+
response = client.embed(texts=batch, model=self.config.embedder_model_name)
|
|
100
|
+
return response.embeddings
|
|
120
101
|
|
|
121
102
|
|
|
122
103
|
@dataclass
|
|
@@ -126,27 +107,11 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
126
107
|
def wrap_error(self, e: Exception) -> Exception:
|
|
127
108
|
return self.config.wrap_error(e=e)
|
|
128
109
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
embeddings.extend(response.embeddings)
|
|
138
|
-
except Exception as e:
|
|
139
|
-
raise self.wrap_error(e=e)
|
|
140
|
-
return embeddings
|
|
141
|
-
|
|
142
|
-
async def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
143
|
-
elements = elements.copy()
|
|
144
|
-
elements_with_text = [e for e in elements if e.get("text")]
|
|
145
|
-
embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
|
|
146
|
-
for element, embedding in zip(elements_with_text, embeddings):
|
|
147
|
-
element[EMBEDDINGS_KEY] = embedding
|
|
148
|
-
return elements
|
|
149
|
-
|
|
150
|
-
async def embed_query(self, query: str) -> list[float]:
|
|
151
|
-
embedding = await self._embed_documents(elements=[query])
|
|
152
|
-
return embedding[0]
|
|
110
|
+
def get_client(self) -> "AsyncVoyageAIClient":
|
|
111
|
+
return self.config.get_async_client()
|
|
112
|
+
|
|
113
|
+
async def embed_batch(
|
|
114
|
+
self, client: "AsyncVoyageAIClient", batch: list[str]
|
|
115
|
+
) -> list[list[float]]:
|
|
116
|
+
response = await client.embed(texts=batch, model=self.config.embedder_model_name)
|
|
117
|
+
return response.embeddings
|
unstructured_ingest/v2/errors.py
CHANGED
|
@@ -16,3 +16,10 @@ class QuotaError(UserError):
|
|
|
16
16
|
|
|
17
17
|
class ProviderError(Exception):
|
|
18
18
|
pass
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
recognized_errors = [UserError, UserAuthError, RateLimitError, QuotaError, ProviderError]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def is_internal_error(e: Exception) -> bool:
|
|
25
|
+
return any(isinstance(e, recognized_error) for recognized_error in recognized_errors)
|