unstructured-ingest 0.3.11__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (36)
  1. test/integration/connectors/test_milvus.py +13 -0
  2. test/integration/connectors/test_onedrive.py +6 -0
  3. test/integration/connectors/test_redis.py +119 -0
  4. test/integration/connectors/test_vectara.py +270 -0
  5. test/integration/embedders/test_bedrock.py +28 -0
  6. test/integration/embedders/test_octoai.py +14 -0
  7. test/integration/embedders/test_openai.py +13 -0
  8. test/integration/embedders/test_togetherai.py +10 -0
  9. test/integration/partitioners/test_partitioner.py +2 -2
  10. test/unit/embed/test_octoai.py +8 -1
  11. unstructured_ingest/__version__.py +1 -1
  12. unstructured_ingest/embed/bedrock.py +39 -11
  13. unstructured_ingest/embed/interfaces.py +5 -0
  14. unstructured_ingest/embed/octoai.py +44 -3
  15. unstructured_ingest/embed/openai.py +37 -1
  16. unstructured_ingest/embed/togetherai.py +28 -1
  17. unstructured_ingest/embed/voyageai.py +33 -1
  18. unstructured_ingest/v2/errors.py +18 -0
  19. unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
  20. unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
  21. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
  22. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
  23. unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
  24. unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
  25. unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
  26. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
  27. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  28. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  29. unstructured_ingest/v2/unstructured_api.py +25 -2
  30. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +23 -19
  31. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +35 -31
  32. test/integration/connectors/test_kafka.py +0 -304
  33. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
  34. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
  35. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
  36. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
@@ -174,6 +174,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
174
174
  uploader.precheck()
175
175
 
176
176
 
177
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
178
+ def test_precheck_fails_on_nonexisting_db(collection: str):
179
+ uploader = MilvusUploader(
180
+ connection_config=MilvusConnectionConfig(uri=DB_URI),
181
+ upload_config=MilvusUploaderConfig(db_name="nonexisting_db", collection_name=collection),
182
+ )
183
+ with pytest.raises(
184
+ DestinationConnectionError,
185
+ match="database not found",
186
+ ):
187
+ uploader.precheck()
188
+
189
+
177
190
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
178
191
  def test_milvus_stager(
179
192
  request: TopRequest,
@@ -20,6 +20,9 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
20
20
 
21
21
 
22
22
  @pytest.fixture
23
+ @pytest.mark.xfail(
24
+ reason="Issues with test setup on the provider side."
25
+ ) # TODO: remove line when issues are addressed
23
26
  def onedrive_test_folder() -> str:
24
27
  """
25
28
  Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -66,6 +69,9 @@ def get_connection_config():
66
69
 
67
70
  @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
68
71
  @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
72
+ @pytest.mark.xfail(
73
+ reason="Issues with test setup on the provider side."
74
+ ) # TODO: remove line when issues are addressed
69
75
  def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
70
76
  """
71
77
  Integration test for the OneDrive destination connector.
@@ -0,0 +1,119 @@
1
+ import asyncio
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import numpy as np
8
+ import pytest
9
+ from redis import exceptions as redis_exceptions
10
+ from redis.asyncio import Redis, from_url
11
+
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
13
+ from test.integration.utils import requires_env
14
+ from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
15
+ from unstructured_ingest.v2.processes.connectors.redisdb import (
16
+ CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
17
+ )
18
+ from unstructured_ingest.v2.processes.connectors.redisdb import (
19
+ RedisAccessConfig,
20
+ RedisConnectionConfig,
21
+ RedisUploader,
22
+ RedisUploaderConfig,
23
+ )
24
+
25
+
26
+ async def delete_record(client: Redis, element_id: str) -> None:
27
+ await client.delete(element_id)
28
+
29
+
30
+ async def validate_upload(client: Redis, first_element: dict):
31
+ element_id = first_element["element_id"]
32
+ expected_text = first_element["text"]
33
+ expected_embeddings = first_element["embeddings"]
34
+ async with client.pipeline(transaction=True) as pipe:
35
+ try:
36
+ response = await pipe.json().get(element_id, "$").execute()
37
+ response = response[0][0]
38
+ except redis_exceptions.ResponseError:
39
+ response = await pipe.get(element_id).execute()
40
+ response = json.loads(response[0])
41
+
42
+ embedding_similarity = np.linalg.norm(
43
+ np.array(response["embeddings"]) - np.array(expected_embeddings)
44
+ )
45
+
46
+ assert response is not None
47
+ assert response["element_id"] == element_id
48
+ assert response["text"] == expected_text
49
+ assert embedding_similarity < 1e-10
50
+
51
+
52
+ async def redis_destination_test(
53
+ upload_file: Path,
54
+ tmp_path: Path,
55
+ connection_kwargs: dict,
56
+ uri: Optional[str] = None,
57
+ password: Optional[str] = None,
58
+ ):
59
+ uploader = RedisUploader(
60
+ connection_config=RedisConnectionConfig(
61
+ **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
62
+ ),
63
+ upload_config=RedisUploaderConfig(batch_size=10),
64
+ )
65
+
66
+ file_data = FileData(
67
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
68
+ connector_type=REDIS_CONNECTOR_TYPE,
69
+ identifier="mock-file-data",
70
+ )
71
+ with upload_file.open() as upload_fp:
72
+ elements = json.load(upload_fp)
73
+ first_element = elements[0]
74
+
75
+ try:
76
+ if uploader.is_async():
77
+ await uploader.run_data_async(data=elements, file_data=file_data)
78
+
79
+ if uri:
80
+ async with from_url(uri) as client:
81
+ await validate_upload(client=client, first_element=first_element)
82
+ else:
83
+ async with Redis(**connection_kwargs, password=password) as client:
84
+ await validate_upload(client=client, first_element=first_element)
85
+ except Exception as e:
86
+ raise e
87
+ finally:
88
+ if uri:
89
+ async with from_url(uri) as client:
90
+ tasks = [delete_record(client, element["element_id"]) for element in elements]
91
+ await asyncio.gather(*tasks)
92
+ else:
93
+ async with Redis(**connection_kwargs, password=password) as client:
94
+ tasks = [delete_record(client, element["element_id"]) for element in elements]
95
+ await asyncio.gather(*tasks)
96
+
97
+
98
+ @pytest.mark.asyncio
99
+ @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG)
100
+ @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
101
+ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
102
+ connection_kwargs = {
103
+ "host": "utic-dashboard-dev.redis.cache.windows.net",
104
+ "port": 6380,
105
+ "db": 0,
106
+ "ssl": True,
107
+ }
108
+ redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
109
+ await redis_destination_test(upload_file, tmp_path, connection_kwargs, password=redis_pw)
110
+
111
+
112
+ @pytest.mark.asyncio
113
+ @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis")
114
+ @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
115
+ async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
116
+ connection_kwargs = {}
117
+ redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
118
+ uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
119
+ await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
@@ -0,0 +1,270 @@
1
+ import json
2
+ import os
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Generator
6
+ from uuid import uuid4
7
+
8
+ import pytest
9
+ import requests
10
+
11
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
14
+ from unstructured_ingest.v2.logger import logger
15
+ from unstructured_ingest.v2.processes.connectors.vectara import (
16
+ CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
17
+ )
18
+ from unstructured_ingest.v2.processes.connectors.vectara import (
19
+ VectaraAccessConfig,
20
+ VectaraConnectionConfig,
21
+ VectaraUploader,
22
+ VectaraUploaderConfig,
23
+ VectaraUploadStager,
24
+ VectaraUploadStagerConfig,
25
+ )
26
+
27
+
28
+ def validate_upload(response: dict, expected_data: dict):
29
+ element_id = expected_data["element_id"]
30
+ expected_text = expected_data["text"]
31
+ filename = expected_data["metadata"]["filename"]
32
+ filetype = expected_data["metadata"]["filetype"]
33
+ page_number = expected_data["metadata"]["page_number"]
34
+
35
+ response = response["search_results"][0]
36
+
37
+ assert response is not None
38
+ assert response["text"] == expected_text
39
+ assert response["part_metadata"]["element_id"] == element_id
40
+ assert response["part_metadata"]["filename"] == filename
41
+ assert response["part_metadata"]["filetype"] == filetype
42
+ assert response["part_metadata"]["page_number"] == page_number
43
+
44
+
45
+ @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
46
+ def _get_jwt_token():
47
+ """Connect to the server and get a JWT token."""
48
+ customer_id = os.environ["VECTARA_CUSTOMER_ID"]
49
+ token_endpoint = (
50
+ f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token"
51
+ )
52
+ headers = {
53
+ "Content-Type": "application/x-www-form-urlencoded",
54
+ }
55
+ data = {
56
+ "grant_type": "client_credentials",
57
+ "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
58
+ "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
59
+ }
60
+
61
+ response = requests.post(token_endpoint, headers=headers, data=data)
62
+ response.raise_for_status()
63
+ response_json = response.json()
64
+
65
+ return response_json.get("access_token")
66
+
67
+
68
+ def query_data(corpus_key: str, element_id: str) -> dict:
69
+
70
+ url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"
71
+
72
+ # the query below requires the corpus to have filter attributes for element_id
73
+
74
+ data = json.dumps(
75
+ {
76
+ "query": "string",
77
+ "search": {
78
+ "metadata_filter": f"part.element_id = '{element_id}'",
79
+ "lexical_interpolation": 1,
80
+ "limit": 10,
81
+ },
82
+ }
83
+ )
84
+
85
+ jwt_token = _get_jwt_token()
86
+ headers = {
87
+ "Content-Type": "application/json",
88
+ "Accept": "application/json",
89
+ "Authorization": f"Bearer {jwt_token}",
90
+ "X-source": "unstructured",
91
+ }
92
+
93
+ response = requests.post(url, headers=headers, data=data)
94
+ response.raise_for_status()
95
+ response_json = response.json()
96
+
97
+ return response_json
98
+
99
+
100
+ def create_corpora(corpus_key: str, corpus_name: str) -> None:
101
+ url = "https://api.vectara.io/v2/corpora"
102
+ data = json.dumps({"key": corpus_key, "name": corpus_name, "description": "integration test"})
103
+ jwt_token = _get_jwt_token()
104
+ headers = {
105
+ "Content-Type": "application/json",
106
+ "Accept": "application/json",
107
+ "Authorization": f"Bearer {jwt_token}",
108
+ "X-source": "unstructured",
109
+ }
110
+
111
+ response = requests.post(url, headers=headers, data=data)
112
+ response.raise_for_status()
113
+
114
+
115
+ def replace_filter_attributes(corpus_key: str) -> None:
116
+ url = f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes"
117
+ data = json.dumps(
118
+ {
119
+ "filter_attributes": [
120
+ {"name": "element_id", "level": "part", "indexed": True, "type": "text"}
121
+ ]
122
+ }
123
+ )
124
+ jwt_token = _get_jwt_token()
125
+ headers = {
126
+ "Content-Type": "application/json",
127
+ "Accept": "application/json",
128
+ "Authorization": f"Bearer {jwt_token}",
129
+ "X-source": "unstructured",
130
+ }
131
+
132
+ response = requests.post(url, headers=headers, data=data)
133
+ response.raise_for_status()
134
+
135
+
136
+ def delete_corpora(corpus_key: str) -> None:
137
+ url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
138
+
139
+ jwt_token = _get_jwt_token()
140
+ headers = {
141
+ "Content-Type": "application/json",
142
+ "Accept": "application/json",
143
+ "Authorization": f"Bearer {jwt_token}",
144
+ "X-source": "unstructured",
145
+ }
146
+
147
+ response = requests.delete(url, headers=headers)
148
+ response.raise_for_status()
149
+
150
+
151
+ def list_corpora() -> list:
152
+ url = "https://api.vectara.io/v2/corpora?limit=100"
153
+ jwt_token = _get_jwt_token()
154
+ headers = {
155
+ "Content-Type": "application/json",
156
+ "Accept": "application/json",
157
+ "Authorization": f"Bearer {jwt_token}",
158
+ "X-source": "unstructured",
159
+ }
160
+ response = requests.get(url, headers=headers)
161
+ response.raise_for_status()
162
+ response_json = response.json()
163
+ if response_json.get("corpora"):
164
+ return [item["key"] for item in response_json.get("corpora")]
165
+ else:
166
+ return []
167
+
168
+
169
+ def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
170
+ def is_ready_status():
171
+ corpora_list = list_corpora()
172
+ return corpus_key in corpora_list
173
+
174
+ start = time.time()
175
+ is_ready = is_ready_status()
176
+ while not is_ready and time.time() - start < timeout:
177
+ time.sleep(interval)
178
+ is_ready = is_ready_status()
179
+ if not is_ready:
180
+ raise TimeoutError("time out waiting for corpus to be ready")
181
+
182
+
183
+ def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
184
+ start = time.time()
185
+ while time.time() - start < timeout:
186
+ corpora_list = list_corpora()
187
+ if corpus_key not in corpora_list:
188
+ return
189
+ time.sleep(interval)
190
+
191
+ raise TimeoutError("time out waiting for corpus to delete")
192
+
193
+
194
+ @pytest.fixture
195
+ def corpora_util() -> Generator[str, None, None]:
196
+ random_id = str(uuid4()).split("-")[0]
197
+ corpus_key = f"ingest-test-{random_id}"
198
+ corpus_name = "ingest-test"
199
+ logger.info(f"Creating corpus with key: {corpus_key}")
200
+ try:
201
+ create_corpora(corpus_key, corpus_name)
202
+ replace_filter_attributes(corpus_key)
203
+ wait_for_ready(corpus_key=corpus_key)
204
+ yield corpus_key
205
+ except Exception as e:
206
+ logger.error(f"failed to create corpus {corpus_key}: {e}")
207
+ finally:
208
+ logger.info(f"deleting corpus: {corpus_key}")
209
+ delete_corpora(corpus_key)
210
+ wait_for_delete(corpus_key=corpus_key)
211
+
212
+
213
+ @pytest.mark.asyncio
214
+ @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
215
+ @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
216
+ async def test_vectara_destination(
217
+ upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
218
+ ):
219
+ corpus_key = corpora_util
220
+ connection_kwargs = {
221
+ "customer_id": os.environ["VECTARA_CUSTOMER_ID"],
222
+ "corpus_key": corpus_key,
223
+ }
224
+
225
+ oauth_client_id = os.environ["VECTARA_OAUTH_CLIENT_ID"]
226
+ oauth_secret = os.environ["VECTARA_OAUTH_SECRET"]
227
+
228
+ file_data = FileData(
229
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
230
+ connector_type=VECTARA_CONNECTOR_TYPE,
231
+ identifier="mock-file-data",
232
+ )
233
+
234
+ stager_config = VectaraUploadStagerConfig(batch_size=10)
235
+ stager = VectaraUploadStager(upload_stager_config=stager_config)
236
+ new_upload_file = stager.run(
237
+ elements_filepath=upload_file,
238
+ output_dir=tmp_path,
239
+ output_filename=upload_file.name,
240
+ file_data=file_data,
241
+ )
242
+
243
+ uploader = VectaraUploader(
244
+ connection_config=VectaraConnectionConfig(
245
+ **connection_kwargs,
246
+ access_config=VectaraAccessConfig(
247
+ oauth_client_id=oauth_client_id, oauth_secret=oauth_secret
248
+ ),
249
+ ),
250
+ upload_config=VectaraUploaderConfig(),
251
+ )
252
+
253
+ with new_upload_file.open() as new_upload_fp:
254
+ elements_stager = json.load(new_upload_fp)
255
+
256
+ if uploader.is_async():
257
+ await uploader.run_data_async(data=elements_stager, file_data=file_data)
258
+
259
+ with upload_file.open() as upload_fp:
260
+ elements = json.load(upload_fp)
261
+ first_element = elements[0]
262
+
263
+ for i in range(retries):
264
+ response = query_data(corpus_key, first_element["element_id"])
265
+ if not response["search_results"]:
266
+ time.sleep(interval)
267
+ else:
268
+ break
269
+
270
+ validate_upload(response=response, expected_data=first_element)
@@ -2,9 +2,12 @@ import json
2
2
  import os
3
3
  from pathlib import Path
4
4
 
5
+ import pytest
6
+
5
7
  from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
8
  from test.integration.utils import requires_env
7
9
  from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
10
+ from unstructured_ingest.v2.errors import UserAuthError, UserError
8
11
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
12
 
10
13
 
@@ -47,3 +50,28 @@ def test_raw_bedrock_embedder(embedder_file: Path):
47
50
  expected_dimensions=(1536,),
48
51
  expected_is_unit_vector=False,
49
52
  )
53
+
54
+
55
+ def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
56
+ embedder = BedrockEmbeddingEncoder(
57
+ config=BedrockEmbeddingConfig(
58
+ aws_access_key_id="no_key",
59
+ aws_secret_access_key="no_secret",
60
+ )
61
+ )
62
+ with pytest.raises(UserAuthError):
63
+ embedder.get_exemplary_embedding()
64
+
65
+
66
+ @requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
67
+ def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
68
+ aws_credentials = get_aws_credentials()
69
+ embedder = BedrockEmbeddingEncoder(
70
+ config=BedrockEmbeddingConfig(
71
+ aws_access_key_id=aws_credentials["aws_access_key_id"],
72
+ aws_secret_access_key=aws_credentials["aws_secret_access_key"],
73
+ model_name="invalid_model",
74
+ )
75
+ )
76
+ with pytest.raises(UserError):
77
+ embedder.get_exemplary_embedding()
@@ -2,9 +2,12 @@ import json
2
2
  import os
3
3
  from pathlib import Path
4
4
 
5
+ import pytest
6
+
5
7
  from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
8
  from test.integration.utils import requires_env
7
9
  from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
10
+ from unstructured_ingest.v2.errors import UserAuthError
8
11
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
12
 
10
13
  API_KEY = "OCTOAI_API_KEY"
@@ -39,3 +42,14 @@ def test_raw_octoai_embedder(embedder_file: Path):
39
42
  validate_raw_embedder(
40
43
  embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
41
44
  )
45
+
46
+
47
+ @pytest.mark.skip(reason="Unexpected connection error at the moment")
48
+ def test_raw_octoai_embedder_invalid_credentials():
49
+ embedder = OctoAIEmbeddingEncoder(
50
+ config=OctoAiEmbeddingConfig(
51
+ api_key="fake_api_key",
52
+ )
53
+ )
54
+ with pytest.raises(UserAuthError):
55
+ embedder.get_exemplary_embedding()
@@ -2,9 +2,12 @@ import json
2
2
  import os
3
3
  from pathlib import Path
4
4
 
5
+ import pytest
6
+
5
7
  from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
8
  from test.integration.utils import requires_env
7
9
  from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
10
+ from unstructured_ingest.v2.errors import UserAuthError
8
11
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
12
 
10
13
  API_KEY = "OPENAI_API_KEY"
@@ -39,3 +42,13 @@ def test_raw_openai_embedder(embedder_file: Path):
39
42
  validate_raw_embedder(
40
43
  embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
41
44
  )
45
+
46
+
47
+ def test_raw_openai_embedder_invalid_credentials():
48
+ embedder = OpenAIEmbeddingEncoder(
49
+ config=OpenAIEmbeddingConfig(
50
+ api_key="fake_api_key",
51
+ )
52
+ )
53
+ with pytest.raises(UserAuthError):
54
+ embedder.get_exemplary_embedding()
@@ -2,12 +2,15 @@ import json
2
2
  import os
3
3
  from pathlib import Path
4
4
 
5
+ import pytest
6
+
5
7
  from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
8
  from test.integration.utils import requires_env
7
9
  from unstructured_ingest.embed.togetherai import (
8
10
  TogetherAIEmbeddingConfig,
9
11
  TogetherAIEmbeddingEncoder,
10
12
  )
13
+ from unstructured_ingest.v2.errors import UserAuthError
11
14
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
12
15
 
13
16
  API_KEY = "TOGETHERAI_API_KEY"
@@ -41,3 +44,10 @@ def test_raw_togetherai_embedder(embedder_file: Path):
41
44
  expected_dimensions=(768,),
42
45
  expected_is_unit_vector=False,
43
46
  )
47
+
48
+
49
+ def test_raw_togetherai_embedder_invalid_credentials():
50
+ embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key="fake_api_key"))
51
+
52
+ with pytest.raises(UserAuthError):
53
+ embedder.get_exemplary_embedding()
@@ -3,9 +3,9 @@ import os
3
3
  from pathlib import Path
4
4
 
5
5
  import pytest
6
- from unstructured_client.models.errors.sdkerror import SDKError
7
6
 
8
7
  from test.integration.utils import requires_env
8
+ from unstructured_ingest.v2.errors import UserError
9
9
  from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
10
10
 
11
11
  int_test_dir = Path(__file__).parent
@@ -71,5 +71,5 @@ async def test_partitioner_api_fast_error(partition_file: Path):
71
71
  strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
72
72
  )
73
73
  partitioner = Partitioner(config=partitioner_config)
74
- with pytest.raises(SDKError):
74
+ with pytest.raises(UserError):
75
75
  await partitioner.run_async(filename=partition_file)
@@ -4,7 +4,14 @@ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbedd
4
4
  def test_embed_documents_does_not_break_element_to_dict(mocker):
5
5
  # Mocked client with the desired behavior for embed_documents
6
6
  mock_client = mocker.MagicMock()
7
- mock_client.embed_documents.return_value = [1, 2]
7
+ mock_data = []
8
+ for i in range(2):
9
+ data = mocker.MagicMock()
10
+ data.embedding = [1, 2]
11
+ mock_data.append(data)
12
+ mock_response = mocker.MagicMock()
13
+ mock_response.data = mock_data
14
+ mock_client.embeddings.create.return_value = mock_response
8
15
 
9
16
  # Mock get_client to return our mock_client
10
17
  mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client)
@@ -1 +1 @@
1
- __version__ = "0.3.11" # pragma: no cover
1
+ __version__ = "0.3.12" # pragma: no cover
@@ -6,7 +6,9 @@ from typing import TYPE_CHECKING
6
6
  from pydantic import Field, SecretStr
7
7
 
8
8
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
9
+ from unstructured_ingest.logger import logger
9
10
  from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
10
12
 
11
13
  if TYPE_CHECKING:
12
14
  from botocore.client import BaseClient
@@ -44,6 +46,32 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
44
46
  class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
45
47
  config: BedrockEmbeddingConfig
46
48
 
49
+ def wrap_error(self, e: Exception) -> Exception:
50
+ from botocore.exceptions import ClientError
51
+
52
+ if isinstance(e, ClientError):
53
+ # https://docs.aws.amazon.com/awssupport/latest/APIReference/CommonErrors.html
54
+ http_response = e.response
55
+ meta = http_response["ResponseMetadata"]
56
+ http_response_code = meta["HTTPStatusCode"]
57
+ error_code = http_response["Error"]["Code"]
58
+ if http_response_code == 400:
59
+ if error_code == "ValidationError":
60
+ return UserError(http_response["Error"])
61
+ elif error_code == "ThrottlingException":
62
+ return RateLimitError(http_response["Error"])
63
+ elif error_code == "NotAuthorized" or error_code == "AccessDeniedException":
64
+ return UserAuthError(http_response["Error"])
65
+ if http_response_code == 403:
66
+ return UserAuthError(http_response["Error"])
67
+ if 400 <= http_response_code < 500:
68
+ return UserError(http_response["Error"])
69
+ if http_response_code >= 500:
70
+ return ProviderError(http_response["Error"])
71
+
72
+ logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
73
+ return e
74
+
47
75
  def embed_query(self, query: str) -> list[float]:
48
76
  """Call out to Bedrock embedding endpoint."""
49
77
  # replace newlines, which can negatively affect performance.
@@ -61,25 +89,25 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
61
89
  input_body["inputText"] = text
62
90
  body = json.dumps(input_body)
63
91
 
92
+ bedrock_client = self.config.get_client()
93
+ # invoke bedrock API
64
94
  try:
65
- bedrock_client = self.config.get_client()
66
- # invoke bedrock API
67
95
  response = bedrock_client.invoke_model(
68
96
  body=body,
69
97
  modelId=self.config.embed_model_name,
70
98
  accept="application/json",
71
99
  contentType="application/json",
72
100
  )
73
-
74
- # format output based on provider
75
- response_body = json.loads(response.get("body").read())
76
- if provider == "cohere":
77
- return response_body.get("embeddings")[0]
78
- else:
79
- # includes common provider == "amazon"
80
- return response_body.get("embedding")
81
101
  except Exception as e:
82
- raise ValueError(f"Error raised by inference endpoint: {e}")
102
+ raise self.wrap_error(e=e)
103
+
104
+ # format output based on provider
105
+ response_body = json.loads(response.get("body").read())
106
+ if provider == "cohere":
107
+ return response_body.get("embeddings")[0]
108
+ else:
109
+ # includes common provider == "amazon"
110
+ return response_body.get("embedding")
83
111
 
84
112
  def embed_documents(self, elements: list[dict]) -> list[dict]:
85
113
  embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
@@ -17,6 +17,11 @@ class BaseEmbeddingEncoder(ABC):
17
17
  """Initializes the embedding encoder class. Should also validate the instance
18
18
  is properly configured: e.g., embed a single a element"""
19
19
 
20
+ def wrap_error(self, e: Exception) -> Exception:
21
+ """Handle errors from the embedding service. Should raise a more informative error
22
+ if possible"""
23
+ return e
24
+
20
25
  @property
21
26
  def num_of_dimensions(self) -> tuple[int, ...]:
22
27
  exemplary_embedding = self.get_exemplary_embedding()