unstructured-ingest 0.3.11__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_milvus.py +13 -0
- test/integration/connectors/test_onedrive.py +6 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/embedders/test_bedrock.py +28 -0
- test/integration/embedders/test_octoai.py +14 -0
- test/integration/embedders/test_openai.py +13 -0
- test/integration/embedders/test_togetherai.py +10 -0
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/embed/test_octoai.py +8 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +39 -11
- unstructured_ingest/embed/interfaces.py +5 -0
- unstructured_ingest/embed/octoai.py +44 -3
- unstructured_ingest/embed/openai.py +37 -1
- unstructured_ingest/embed/togetherai.py +28 -1
- unstructured_ingest/embed/voyageai.py +33 -1
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
- unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/unstructured_api.py +25 -2
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +23 -19
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +35 -31
- test/integration/connectors/test_kafka.py +0 -304
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
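Taken together, the headline changes are two new destination connectors (Redis and Vectara), a shared error taxonomy in unstructured_ingest/v2/errors.py, and provider-error wrapping in the embedders. As a minimal sketch of how the new Redis destination is wired up, based on the test code reproduced below (the connection URI is a placeholder, and batch_size=10 simply mirrors the tests):

from unstructured_ingest.v2.processes.connectors.redisdb import (
    RedisAccessConfig,
    RedisConnectionConfig,
    RedisUploader,
    RedisUploaderConfig,
)

# Placeholder endpoint; the tests below use an Azure Cache for Redis host over TLS.
uploader = RedisUploader(
    connection_config=RedisConnectionConfig(
        access_config=RedisAccessConfig(uri="rediss://:<password>@<host>:6380/0"),
    ),
    upload_config=RedisUploaderConfig(batch_size=10),
)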
test/integration/connectors/test_milvus.py
CHANGED
@@ -174,6 +174,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
         uploader.precheck()
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_precheck_fails_on_nonexisting_db(collection: str):
+    uploader = MilvusUploader(
+        connection_config=MilvusConnectionConfig(uri=DB_URI),
+        upload_config=MilvusUploaderConfig(db_name="nonexisting_db", collection_name=collection),
+    )
+    with pytest.raises(
+        DestinationConnectionError,
+        match="database not found",
+    ):
+        uploader.precheck()
+
+
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_milvus_stager(
     request: TopRequest,
test/integration/connectors/test_onedrive.py
CHANGED
@@ -20,6 +20,9 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
 
 
 @pytest.fixture
+@pytest.mark.xfail(
+    reason="Issues with test setup on the provider side."
+)  # TODO: remove line when issues are addressed
 def onedrive_test_folder() -> str:
     """
     Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -66,6 +69,9 @@ def get_connection_config():
 
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+@pytest.mark.xfail(
+    reason="Issues with test setup on the provider side."
+)  # TODO: remove line when issues are addressed
 def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     """
     Integration test for the OneDrive destination connector.
test/integration/connectors/test_redis.py
ADDED
@@ -0,0 +1,119 @@
+import asyncio
+import json
+import os
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import pytest
+from redis import exceptions as redis_exceptions
+from redis.asyncio import Redis, from_url
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.redisdb import (
+    CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.redisdb import (
+    RedisAccessConfig,
+    RedisConnectionConfig,
+    RedisUploader,
+    RedisUploaderConfig,
+)
+
+
+async def delete_record(client: Redis, element_id: str) -> None:
+    await client.delete(element_id)
+
+
+async def validate_upload(client: Redis, first_element: dict):
+    element_id = first_element["element_id"]
+    expected_text = first_element["text"]
+    expected_embeddings = first_element["embeddings"]
+    async with client.pipeline(transaction=True) as pipe:
+        try:
+            response = await pipe.json().get(element_id, "$").execute()
+            response = response[0][0]
+        except redis_exceptions.ResponseError:
+            response = await pipe.get(element_id).execute()
+            response = json.loads(response[0])
+
+    embedding_similarity = np.linalg.norm(
+        np.array(response["embeddings"]) - np.array(expected_embeddings)
+    )
+
+    assert response is not None
+    assert response["element_id"] == element_id
+    assert response["text"] == expected_text
+    assert embedding_similarity < 1e-10
+
+
+async def redis_destination_test(
+    upload_file: Path,
+    tmp_path: Path,
+    connection_kwargs: dict,
+    uri: Optional[str] = None,
+    password: Optional[str] = None,
+):
+    uploader = RedisUploader(
+        connection_config=RedisConnectionConfig(
+            **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
+        ),
+        upload_config=RedisUploaderConfig(batch_size=10),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=REDIS_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+    first_element = elements[0]
+
+    try:
+        if uploader.is_async():
+            await uploader.run_data_async(data=elements, file_data=file_data)
+
+        if uri:
+            async with from_url(uri) as client:
+                await validate_upload(client=client, first_element=first_element)
+        else:
+            async with Redis(**connection_kwargs, password=password) as client:
+                await validate_upload(client=client, first_element=first_element)
+    except Exception as e:
+        raise e
+    finally:
+        if uri:
+            async with from_url(uri) as client:
+                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                await asyncio.gather(*tasks)
+        else:
+            async with Redis(**connection_kwargs, password=password) as client:
+                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                await asyncio.gather(*tasks)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
+async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {
+        "host": "utic-dashboard-dev.redis.cache.windows.net",
+        "port": 6380,
+        "db": 0,
+        "ssl": True,
+    }
+    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, password=redis_pw)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis")
+@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
+async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {}
+    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
+    uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
test/integration/connectors/test_vectara.py
ADDED
@@ -0,0 +1,270 @@
+import json
+import os
+import time
+from pathlib import Path
+from typing import Generator
+from uuid import uuid4
+
+import pytest
+import requests
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connectors.vectara import (
+    CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.vectara import (
+    VectaraAccessConfig,
+    VectaraConnectionConfig,
+    VectaraUploader,
+    VectaraUploaderConfig,
+    VectaraUploadStager,
+    VectaraUploadStagerConfig,
+)
+
+
+def validate_upload(response: dict, expected_data: dict):
+    element_id = expected_data["element_id"]
+    expected_text = expected_data["text"]
+    filename = expected_data["metadata"]["filename"]
+    filetype = expected_data["metadata"]["filetype"]
+    page_number = expected_data["metadata"]["page_number"]
+
+    response = response["search_results"][0]
+
+    assert response is not None
+    assert response["text"] == expected_text
+    assert response["part_metadata"]["element_id"] == element_id
+    assert response["part_metadata"]["filename"] == filename
+    assert response["part_metadata"]["filetype"] == filetype
+    assert response["part_metadata"]["page_number"] == page_number
+
+
+@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+def _get_jwt_token():
+    """Connect to the server and get a JWT token."""
+    customer_id = os.environ["VECTARA_CUSTOMER_ID"]
+    token_endpoint = (
+        f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token"
+    )
+    headers = {
+        "Content-Type": "application/x-www-form-urlencoded",
+    }
+    data = {
+        "grant_type": "client_credentials",
+        "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
+        "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
+    }
+
+    response = requests.post(token_endpoint, headers=headers, data=data)
+    response.raise_for_status()
+    response_json = response.json()
+
+    return response_json.get("access_token")
+
+
+def query_data(corpus_key: str, element_id: str) -> dict:
+
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"
+
+    # the query below requires the corpus to have filter attributes for element_id
+
+    data = json.dumps(
+        {
+            "query": "string",
+            "search": {
+                "metadata_filter": f"part.element_id = '{element_id}'",
+                "lexical_interpolation": 1,
+                "limit": 10,
+            },
+        }
+    )
+
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+    response_json = response.json()
+
+    return response_json
+
+
+def create_corpora(corpus_key: str, corpus_name: str) -> None:
+    url = "https://api.vectara.io/v2/corpora"
+    data = json.dumps({"key": corpus_key, "name": corpus_name, "description": "integration test"})
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+
+
+def replace_filter_attributes(corpus_key: str) -> None:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes"
+    data = json.dumps(
+        {
+            "filter_attributes": [
+                {"name": "element_id", "level": "part", "indexed": True, "type": "text"}
+            ]
+        }
+    )
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+
+
+def delete_corpora(corpus_key: str) -> None:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
+
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.delete(url, headers=headers)
+    response.raise_for_status()
+
+
+def list_corpora() -> list:
+    url = "https://api.vectara.io/v2/corpora?limit=100"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    response_json = response.json()
+    if response_json.get("corpora"):
+        return [item["key"] for item in response_json.get("corpora")]
+    else:
+        return []
+
+
+def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
+    def is_ready_status():
+        corpora_list = list_corpora()
+        return corpus_key in corpora_list
+
+    start = time.time()
+    is_ready = is_ready_status()
+    while not is_ready and time.time() - start < timeout:
+        time.sleep(interval)
+        is_ready = is_ready_status()
+    if not is_ready:
+        raise TimeoutError("time out waiting for corpus to be ready")
+
+
+def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
+    start = time.time()
+    while time.time() - start < timeout:
+        corpora_list = list_corpora()
+        if corpus_key not in corpora_list:
+            return
+        time.sleep(interval)
+
+    raise TimeoutError("time out waiting for corpus to delete")
+
+
+@pytest.fixture
+def corpora_util() -> Generator[str, None, None]:
+    random_id = str(uuid4()).split("-")[0]
+    corpus_key = f"ingest-test-{random_id}"
+    corpus_name = "ingest-test"
+    logger.info(f"Creating corpus with key: {corpus_key}")
+    try:
+        create_corpora(corpus_key, corpus_name)
+        replace_filter_attributes(corpus_key)
+        wait_for_ready(corpus_key=corpus_key)
+        yield corpus_key
+    except Exception as e:
+        logger.error(f"failed to create corpus {corpus_key}: {e}")
+    finally:
+        logger.info(f"deleting corpus: {corpus_key}")
+        delete_corpora(corpus_key)
+        wait_for_delete(corpus_key=corpus_key)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
+@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+async def test_vectara_destination(
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
+):
+    corpus_key = corpora_util
+    connection_kwargs = {
+        "customer_id": os.environ["VECTARA_CUSTOMER_ID"],
+        "corpus_key": corpus_key,
+    }
+
+    oauth_client_id = os.environ["VECTARA_OAUTH_CLIENT_ID"]
+    oauth_secret = os.environ["VECTARA_OAUTH_SECRET"]
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=VECTARA_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    stager_config = VectaraUploadStagerConfig(batch_size=10)
+    stager = VectaraUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=upload_file,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+        file_data=file_data,
+    )
+
+    uploader = VectaraUploader(
+        connection_config=VectaraConnectionConfig(
+            **connection_kwargs,
+            access_config=VectaraAccessConfig(
+                oauth_client_id=oauth_client_id, oauth_secret=oauth_secret
+            ),
+        ),
+        upload_config=VectaraUploaderConfig(),
+    )
+
+    with new_upload_file.open() as new_upload_fp:
+        elements_stager = json.load(new_upload_fp)
+
+    if uploader.is_async():
+        await uploader.run_data_async(data=elements_stager, file_data=file_data)
+
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+    first_element = elements[0]
+
+    for i in range(retries):
+        response = query_data(corpus_key, first_element["element_id"])
+        if not response["search_results"]:
+            time.sleep(interval)
+        else:
+            break
+
+    validate_upload(response=response, expected_data=first_element)
test/integration/embedders/test_bedrock.py
CHANGED
@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
 
+import pytest
+
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError, UserError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 
 
@@ -47,3 +50,28 @@ def test_raw_bedrock_embedder(embedder_file: Path):
         expected_dimensions=(1536,),
         expected_is_unit_vector=False,
     )
+
+
+def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
+    embedder = BedrockEmbeddingEncoder(
+        config=BedrockEmbeddingConfig(
+            aws_access_key_id="no_key",
+            aws_secret_access_key="no_secret",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
+
+
+@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
+def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
+    aws_credentials = get_aws_credentials()
+    embedder = BedrockEmbeddingEncoder(
+        config=BedrockEmbeddingConfig(
+            aws_access_key_id=aws_credentials["aws_access_key_id"],
+            aws_secret_access_key=aws_credentials["aws_secret_access_key"],
+            model_name="invalid_model",
+        )
+    )
+    with pytest.raises(UserError):
+        embedder.get_exemplary_embedding()
test/integration/embedders/test_octoai.py
CHANGED
@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
 
+import pytest
+
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 
 API_KEY = "OCTOAI_API_KEY"
@@ -39,3 +42,14 @@ def test_raw_octoai_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
     )
+
+
+@pytest.mark.skip(reason="Unexpected connection error at the moment")
+def test_raw_octoai_embedder_invalid_credentials():
+    embedder = OctoAIEmbeddingEncoder(
+        config=OctoAiEmbeddingConfig(
+            api_key="fake_api_key",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
test/integration/embedders/test_openai.py
CHANGED
@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
 
+import pytest
+
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 
 API_KEY = "OPENAI_API_KEY"
@@ -39,3 +42,13 @@ def test_raw_openai_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
     )
+
+
+def test_raw_openai_embedder_invalid_credentials():
+    embedder = OpenAIEmbeddingEncoder(
+        config=OpenAIEmbeddingConfig(
+            api_key="fake_api_key",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
test/integration/embedders/test_togetherai.py
CHANGED
@@ -2,12 +2,15 @@ import json
 import os
 from pathlib import Path
 
+import pytest
+
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.togetherai import (
     TogetherAIEmbeddingConfig,
     TogetherAIEmbeddingEncoder,
 )
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 
 API_KEY = "TOGETHERAI_API_KEY"
@@ -41,3 +44,10 @@ def test_raw_togetherai_embedder(embedder_file: Path):
         expected_dimensions=(768,),
         expected_is_unit_vector=False,
     )
+
+
+def test_raw_togetherai_embedder_invalid_credentials():
+    embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key="fake_api_key"))
+
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
test/integration/partitioners/test_partitioner.py
CHANGED
@@ -3,9 +3,9 @@ import os
 from pathlib import Path
 
 import pytest
-from unstructured_client.models.errors.sdkerror import SDKError
 
 from test.integration.utils import requires_env
+from unstructured_ingest.v2.errors import UserError
 from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
 
 int_test_dir = Path(__file__).parent
@@ -71,5 +71,5 @@ async def test_partitioner_api_fast_error(partition_file: Path):
         strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
     )
     partitioner = Partitioner(config=partitioner_config)
-    with pytest.raises(SDKError):
+    with pytest.raises(UserError):
         await partitioner.run_async(filename=partition_file)
test/unit/embed/test_octoai.py
CHANGED
@@ -4,7 +4,14 @@ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbedd
 def test_embed_documents_does_not_break_element_to_dict(mocker):
     # Mocked client with the desired behavior for embed_documents
     mock_client = mocker.MagicMock()
-
+    mock_data = []
+    for i in range(2):
+        data = mocker.MagicMock()
+        data.embedding = [1, 2]
+        mock_data.append(data)
+    mock_response = mocker.MagicMock()
+    mock_response.data = mock_data
+    mock_client.embeddings.create.return_value = mock_response
 
     # Mock get_client to return our mock_client
     mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client)
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.11"  # pragma: no cover
+__version__ = "0.3.12"  # pragma: no cover
unstructured_ingest/embed/bedrock.py
CHANGED
@@ -6,7 +6,9 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
 
 if TYPE_CHECKING:
     from botocore.client import BaseClient
@@ -44,6 +46,32 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
 class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     config: BedrockEmbeddingConfig
 
+    def wrap_error(self, e: Exception) -> Exception:
+        from botocore.exceptions import ClientError
+
+        if isinstance(e, ClientError):
+            # https://docs.aws.amazon.com/awssupport/latest/APIReference/CommonErrors.html
+            http_response = e.response
+            meta = http_response["ResponseMetadata"]
+            http_response_code = meta["HTTPStatusCode"]
+            error_code = http_response["Error"]["Code"]
+            if http_response_code == 400:
+                if error_code == "ValidationError":
+                    return UserError(http_response["Error"])
+                elif error_code == "ThrottlingException":
+                    return RateLimitError(http_response["Error"])
+                elif error_code == "NotAuthorized" or error_code == "AccessDeniedException":
+                    return UserAuthError(http_response["Error"])
+            if http_response_code == 403:
+                return UserAuthError(http_response["Error"])
+            if 400 <= http_response_code < 500:
+                return UserError(http_response["Error"])
+            if http_response_code >= 500:
+                return ProviderError(http_response["Error"])
+
+        logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
+        return e
+
     def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
         # replace newlines, which can negatively affect performance.
@@ -61,25 +89,25 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
             input_body["inputText"] = text
         body = json.dumps(input_body)
 
+        bedrock_client = self.config.get_client()
+        # invoke bedrock API
         try:
-            bedrock_client = self.config.get_client()
-            # invoke bedrock API
             response = bedrock_client.invoke_model(
                 body=body,
                 modelId=self.config.embed_model_name,
                 accept="application/json",
                 contentType="application/json",
             )
-
-            # format output based on provider
-            response_body = json.loads(response.get("body").read())
-            if provider == "cohere":
-                return response_body.get("embeddings")[0]
-            else:
-                # includes common provider == "amazon"
-                return response_body.get("embedding")
         except Exception as e:
-            raise
+            raise self.wrap_error(e=e)
+
+        # format output based on provider
+        response_body = json.loads(response.get("body").read())
+        if provider == "cohere":
+            return response_body.get("embeddings")[0]
+        else:
+            # includes common provider == "amazon"
+            return response_body.get("embedding")
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
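The practical effect of wrap_error is that callers of the Bedrock encoder now see the shared v2 error types instead of raw botocore exceptions. A minimal sketch of the behavior the new tests assert, using deliberately invalid credentials:

from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
from unstructured_ingest.v2.errors import UserAuthError

embedder = BedrockEmbeddingEncoder(
    config=BedrockEmbeddingConfig(aws_access_key_id="no_key", aws_secret_access_key="no_secret")
)
try:
    embedder.get_exemplary_embedding()
except UserAuthError as e:
    # The bad-credential ClientError from botocore is re-raised as UserAuthError.
    print(f"auth failure surfaced as: {e}")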
unstructured_ingest/embed/interfaces.py
CHANGED
@@ -17,6 +17,11 @@ class BaseEmbeddingEncoder(ABC):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
 
+    def wrap_error(self, e: Exception) -> Exception:
+        """Handle errors from the embedding service. Should raise a more informative error
+        if possible"""
+        return e
+
     @property
     def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
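The base-class hook returns the exception unchanged by default, so encoders that do not override it keep their old behavior; subclasses such as the Bedrock encoder above translate provider failures into the shared taxonomy and re-raise via raise self.wrap_error(e=e). A minimal sketch of the override pattern; HypotheticalEncoder and its status-code mapping are illustrative only and not part of this release:

from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
from unstructured_ingest.v2.errors import ProviderError, UserAuthError


class HypotheticalEncoder(BaseEmbeddingEncoder):
    # Abstract embedding methods omitted; this class only illustrates the hook.
    def wrap_error(self, e: Exception) -> Exception:
        # Map HTTP-style provider errors onto the shared taxonomy, falling back
        # to the original exception when no mapping applies.
        status = getattr(e, "status_code", None)
        if status in (401, 403):
            return UserAuthError(e)
        if status is not None and status >= 500:
            return ProviderError(e)
        return e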
|