unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +6 -2
- test/integration/connectors/sql/test_singlestore.py +6 -2
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +6 -2
- test/integration/connectors/test_milvus.py +13 -0
- test/integration/connectors/test_onedrive.py +6 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/embedders/test_bedrock.py +28 -0
- test/integration/embedders/test_octoai.py +14 -0
- test/integration/embedders/test_openai.py +13 -0
- test/integration/embedders/test_togetherai.py +10 -0
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/embed/test_octoai.py +8 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +39 -11
- unstructured_ingest/embed/interfaces.py +5 -0
- unstructured_ingest/embed/octoai.py +44 -3
- unstructured_ingest/embed/openai.py +37 -1
- unstructured_ingest/embed/togetherai.py +28 -1
- unstructured_ingest/embed/voyageai.py +33 -1
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/file_data.py +11 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
- unstructured_ingest/v2/processes/connectors/couchbase.py +2 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +3 -4
- unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +5 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/unstructured_api.py +25 -2
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +20 -16
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +52 -48
- test/integration/connectors/test_kafka.py +0 -304
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
|
@@ -2,9 +2,12 @@ import json
|
|
|
2
2
|
import os
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
5
7
|
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
8
|
from test.integration.utils import requires_env
|
|
7
9
|
from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
|
|
10
|
+
from unstructured_ingest.v2.errors import UserAuthError, UserError
|
|
8
11
|
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
9
12
|
|
|
10
13
|
|
|
@@ -47,3 +50,28 @@ def test_raw_bedrock_embedder(embedder_file: Path):
|
|
|
47
50
|
expected_dimensions=(1536,),
|
|
48
51
|
expected_is_unit_vector=False,
|
|
49
52
|
)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
|
|
56
|
+
embedder = BedrockEmbeddingEncoder(
|
|
57
|
+
config=BedrockEmbeddingConfig(
|
|
58
|
+
aws_access_key_id="no_key",
|
|
59
|
+
aws_secret_access_key="no_secret",
|
|
60
|
+
)
|
|
61
|
+
)
|
|
62
|
+
with pytest.raises(UserAuthError):
|
|
63
|
+
embedder.get_exemplary_embedding()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
|
|
67
|
+
def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
|
|
68
|
+
aws_credentials = get_aws_credentials()
|
|
69
|
+
embedder = BedrockEmbeddingEncoder(
|
|
70
|
+
config=BedrockEmbeddingConfig(
|
|
71
|
+
aws_access_key_id=aws_credentials["aws_access_key_id"],
|
|
72
|
+
aws_secret_access_key=aws_credentials["aws_secret_access_key"],
|
|
73
|
+
model_name="invalid_model",
|
|
74
|
+
)
|
|
75
|
+
)
|
|
76
|
+
with pytest.raises(UserError):
|
|
77
|
+
embedder.get_exemplary_embedding()
|
|
@@ -2,9 +2,12 @@ import json
|
|
|
2
2
|
import os
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
5
7
|
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
8
|
from test.integration.utils import requires_env
|
|
7
9
|
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
|
|
10
|
+
from unstructured_ingest.v2.errors import UserAuthError
|
|
8
11
|
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
9
12
|
|
|
10
13
|
API_KEY = "OCTOAI_API_KEY"
|
|
@@ -39,3 +42,14 @@ def test_raw_octoai_embedder(embedder_file: Path):
|
|
|
39
42
|
validate_raw_embedder(
|
|
40
43
|
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
|
|
41
44
|
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@pytest.mark.skip(reason="Unexpected connection error at the moment")
|
|
48
|
+
def test_raw_octoai_embedder_invalid_credentials():
|
|
49
|
+
embedder = OctoAIEmbeddingEncoder(
|
|
50
|
+
config=OctoAiEmbeddingConfig(
|
|
51
|
+
api_key="fake_api_key",
|
|
52
|
+
)
|
|
53
|
+
)
|
|
54
|
+
with pytest.raises(UserAuthError):
|
|
55
|
+
embedder.get_exemplary_embedding()
|
|
@@ -2,9 +2,12 @@ import json
|
|
|
2
2
|
import os
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
5
7
|
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
8
|
from test.integration.utils import requires_env
|
|
7
9
|
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
10
|
+
from unstructured_ingest.v2.errors import UserAuthError
|
|
8
11
|
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
9
12
|
|
|
10
13
|
API_KEY = "OPENAI_API_KEY"
|
|
@@ -39,3 +42,13 @@ def test_raw_openai_embedder(embedder_file: Path):
|
|
|
39
42
|
validate_raw_embedder(
|
|
40
43
|
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
|
|
41
44
|
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_raw_openai_embedder_invalid_credentials():
|
|
48
|
+
embedder = OpenAIEmbeddingEncoder(
|
|
49
|
+
config=OpenAIEmbeddingConfig(
|
|
50
|
+
api_key="fake_api_key",
|
|
51
|
+
)
|
|
52
|
+
)
|
|
53
|
+
with pytest.raises(UserAuthError):
|
|
54
|
+
embedder.get_exemplary_embedding()
|
|
@@ -2,12 +2,15 @@ import json
|
|
|
2
2
|
import os
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
5
7
|
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
8
|
from test.integration.utils import requires_env
|
|
7
9
|
from unstructured_ingest.embed.togetherai import (
|
|
8
10
|
TogetherAIEmbeddingConfig,
|
|
9
11
|
TogetherAIEmbeddingEncoder,
|
|
10
12
|
)
|
|
13
|
+
from unstructured_ingest.v2.errors import UserAuthError
|
|
11
14
|
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
12
15
|
|
|
13
16
|
API_KEY = "TOGETHERAI_API_KEY"
|
|
@@ -41,3 +44,10 @@ def test_raw_togetherai_embedder(embedder_file: Path):
|
|
|
41
44
|
expected_dimensions=(768,),
|
|
42
45
|
expected_is_unit_vector=False,
|
|
43
46
|
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_raw_togetherai_embedder_invalid_credentials():
|
|
50
|
+
embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key="fake_api_key"))
|
|
51
|
+
|
|
52
|
+
with pytest.raises(UserAuthError):
|
|
53
|
+
embedder.get_exemplary_embedding()
|
|
@@ -3,9 +3,9 @@ import os
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
5
|
import pytest
|
|
6
|
-
from unstructured_client.models.errors.sdkerror import SDKError
|
|
7
6
|
|
|
8
7
|
from test.integration.utils import requires_env
|
|
8
|
+
from unstructured_ingest.v2.errors import UserError
|
|
9
9
|
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
|
|
10
10
|
|
|
11
11
|
int_test_dir = Path(__file__).parent
|
|
@@ -71,5 +71,5 @@ async def test_partitioner_api_fast_error(partition_file: Path):
|
|
|
71
71
|
strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
|
|
72
72
|
)
|
|
73
73
|
partitioner = Partitioner(config=partitioner_config)
|
|
74
|
-
with pytest.raises(SDKError):
|
|
74
|
+
with pytest.raises(UserError):
|
|
75
75
|
await partitioner.run_async(filename=partition_file)
|
test/unit/embed/test_octoai.py
CHANGED
|
@@ -4,7 +4,14 @@ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbedd
|
|
|
4
4
|
def test_embed_documents_does_not_break_element_to_dict(mocker):
|
|
5
5
|
# Mocked client with the desired behavior for embed_documents
|
|
6
6
|
mock_client = mocker.MagicMock()
|
|
7
|
-
|
|
7
|
+
mock_data = []
|
|
8
|
+
for i in range(2):
|
|
9
|
+
data = mocker.MagicMock()
|
|
10
|
+
data.embedding = [1, 2]
|
|
11
|
+
mock_data.append(data)
|
|
12
|
+
mock_response = mocker.MagicMock()
|
|
13
|
+
mock_response.data = mock_data
|
|
14
|
+
mock_client.embeddings.create.return_value = mock_response
|
|
8
15
|
|
|
9
16
|
# Mock get_client to return our mock_client
|
|
10
17
|
mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client)
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.3.10" # pragma: no cover
|
|
1
|
+
__version__ = "0.3.12" # pragma: no cover
|
|
@@ -6,7 +6,9 @@ from typing import TYPE_CHECKING
|
|
|
6
6
|
from pydantic import Field, SecretStr
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
9
|
+
from unstructured_ingest.logger import logger
|
|
9
10
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
|
|
10
12
|
|
|
11
13
|
if TYPE_CHECKING:
|
|
12
14
|
from botocore.client import BaseClient
|
|
@@ -44,6 +46,32 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
|
|
|
44
46
|
class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
45
47
|
config: BedrockEmbeddingConfig
|
|
46
48
|
|
|
49
|
+
def wrap_error(self, e: Exception) -> Exception:
|
|
50
|
+
from botocore.exceptions import ClientError
|
|
51
|
+
|
|
52
|
+
if isinstance(e, ClientError):
|
|
53
|
+
# https://docs.aws.amazon.com/awssupport/latest/APIReference/CommonErrors.html
|
|
54
|
+
http_response = e.response
|
|
55
|
+
meta = http_response["ResponseMetadata"]
|
|
56
|
+
http_response_code = meta["HTTPStatusCode"]
|
|
57
|
+
error_code = http_response["Error"]["Code"]
|
|
58
|
+
if http_response_code == 400:
|
|
59
|
+
if error_code == "ValidationError":
|
|
60
|
+
return UserError(http_response["Error"])
|
|
61
|
+
elif error_code == "ThrottlingException":
|
|
62
|
+
return RateLimitError(http_response["Error"])
|
|
63
|
+
elif error_code == "NotAuthorized" or error_code == "AccessDeniedException":
|
|
64
|
+
return UserAuthError(http_response["Error"])
|
|
65
|
+
if http_response_code == 403:
|
|
66
|
+
return UserAuthError(http_response["Error"])
|
|
67
|
+
if 400 <= http_response_code < 500:
|
|
68
|
+
return UserError(http_response["Error"])
|
|
69
|
+
if http_response_code >= 500:
|
|
70
|
+
return ProviderError(http_response["Error"])
|
|
71
|
+
|
|
72
|
+
logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
|
|
73
|
+
return e
|
|
74
|
+
|
|
47
75
|
def embed_query(self, query: str) -> list[float]:
|
|
48
76
|
"""Call out to Bedrock embedding endpoint."""
|
|
49
77
|
# replace newlines, which can negatively affect performance.
|
|
@@ -61,25 +89,25 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
61
89
|
input_body["inputText"] = text
|
|
62
90
|
body = json.dumps(input_body)
|
|
63
91
|
|
|
92
|
+
bedrock_client = self.config.get_client()
|
|
93
|
+
# invoke bedrock API
|
|
64
94
|
try:
|
|
65
|
-
bedrock_client = self.config.get_client()
|
|
66
|
-
# invoke bedrock API
|
|
67
95
|
response = bedrock_client.invoke_model(
|
|
68
96
|
body=body,
|
|
69
97
|
modelId=self.config.embed_model_name,
|
|
70
98
|
accept="application/json",
|
|
71
99
|
contentType="application/json",
|
|
72
100
|
)
|
|
73
|
-
|
|
74
|
-
# format output based on provider
|
|
75
|
-
response_body = json.loads(response.get("body").read())
|
|
76
|
-
if provider == "cohere":
|
|
77
|
-
return response_body.get("embeddings")[0]
|
|
78
|
-
else:
|
|
79
|
-
# includes common provider == "amazon"
|
|
80
|
-
return response_body.get("embedding")
|
|
81
101
|
except Exception as e:
|
|
82
|
-
raise
|
|
102
|
+
raise self.wrap_error(e=e)
|
|
103
|
+
|
|
104
|
+
# format output based on provider
|
|
105
|
+
response_body = json.loads(response.get("body").read())
|
|
106
|
+
if provider == "cohere":
|
|
107
|
+
return response_body.get("embeddings")[0]
|
|
108
|
+
else:
|
|
109
|
+
# includes common provider == "amazon"
|
|
110
|
+
return response_body.get("embedding")
|
|
83
111
|
|
|
84
112
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
85
113
|
embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
|
|
@@ -17,6 +17,11 @@ class BaseEmbeddingEncoder(ABC):
|
|
|
17
17
|
"""Initializes the embedding encoder class. Should also validate the instance
|
|
18
18
|
is properly configured: e.g., embed a single a element"""
|
|
19
19
|
|
|
20
|
+
def wrap_error(self, e: Exception) -> Exception:
|
|
21
|
+
"""Handle errors from the embedding service. Should raise a more informative error
|
|
22
|
+
if possible"""
|
|
23
|
+
return e
|
|
24
|
+
|
|
20
25
|
@property
|
|
21
26
|
def num_of_dimensions(self) -> tuple[int, ...]:
|
|
22
27
|
exemplary_embedding = self.get_exemplary_embedding()
|
|
@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
from pydantic import Field, SecretStr
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
7
|
+
from unstructured_ingest.logger import logger
|
|
7
8
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.errors import (
|
|
10
|
+
ProviderError,
|
|
11
|
+
QuotaError,
|
|
12
|
+
RateLimitError,
|
|
13
|
+
UserAuthError,
|
|
14
|
+
UserError,
|
|
15
|
+
)
|
|
8
16
|
|
|
9
17
|
if TYPE_CHECKING:
|
|
10
18
|
from openai import OpenAI
|
|
@@ -30,12 +38,45 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
|
|
|
30
38
|
class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
31
39
|
config: OctoAiEmbeddingConfig
|
|
32
40
|
|
|
41
|
+
def wrap_error(self, e: Exception) -> Exception:
    """Translate an exception from the OpenAI-compatible client into an ingest error.

    Args:
        e: The exception caught around the embeddings request.

    Returns:
        ``UserAuthError`` for HTTP 401, ``QuotaError`` or ``RateLimitError`` for
        HTTP 429, ``UserError`` for any other 4xx status and ``ProviderError``
        for 5xx. Any other exception is logged and returned unchanged; the
        caller is expected to ``raise`` whatever this method returns.
    """
    # https://platform.openai.com/docs/guides/error-codes/api-errors
    from openai import APIStatusError

    if not isinstance(e, APIStatusError):
        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
        # Return instead of raising so this method always honours its
        # declared contract of handing an exception back to the caller.
        return e
    error_code = e.code
    if 400 <= e.status_code < 500:
        # user error
        if e.status_code == 401:
            return UserAuthError(e.message)
        if e.status_code == 429:
            # 429 indicates either rate limit exceeded or quota exceeded
            if error_code == "insufficient_quota":
                return QuotaError(e.message)
            return RateLimitError(e.message)
        return UserError(e.message)
    if e.status_code >= 500:
        return ProviderError(e.message)
    # Status codes below 400 are not expected on an error object.
    logger.error(f"unhandled exception from openai: {e}", exc_info=True)
    return e
|
|
64
|
+
|
|
33
65
|
def embed_query(self, query: str):
|
|
34
|
-
|
|
35
|
-
|
|
66
|
+
try:
|
|
67
|
+
client = self.config.get_client()
|
|
68
|
+
response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
|
|
69
|
+
except Exception as e:
|
|
70
|
+
raise self.wrap_error(e=e)
|
|
36
71
|
return response.data[0].embedding
|
|
37
72
|
|
|
38
73
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
39
|
-
|
|
74
|
+
texts = [e.get("text", "") for e in elements]
|
|
75
|
+
try:
|
|
76
|
+
client = self.config.get_client()
|
|
77
|
+
response = client.embeddings.create(input=texts, model=self.config.embedder_model_name)
|
|
78
|
+
except Exception as e:
|
|
79
|
+
raise self.wrap_error(e=e)
|
|
80
|
+
embeddings = [data.embedding for data in response.data]
|
|
40
81
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
41
82
|
return elements_with_embeddings
|
|
@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
from pydantic import Field, SecretStr
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
7
|
+
from unstructured_ingest.logger import logger
|
|
7
8
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.errors import (
|
|
10
|
+
ProviderError,
|
|
11
|
+
QuotaError,
|
|
12
|
+
RateLimitError,
|
|
13
|
+
UserAuthError,
|
|
14
|
+
UserError,
|
|
15
|
+
)
|
|
8
16
|
|
|
9
17
|
if TYPE_CHECKING:
|
|
10
18
|
from openai import OpenAI
|
|
@@ -25,9 +33,37 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
|
|
|
25
33
|
class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
26
34
|
config: OpenAIEmbeddingConfig
|
|
27
35
|
|
|
36
|
+
def wrap_error(self, e: Exception) -> Exception:
    """Translate an exception raised by the OpenAI client into an ingest error.

    Args:
        e: The exception caught around the embeddings request.

    Returns:
        ``UserAuthError`` for HTTP 401, ``QuotaError`` or ``RateLimitError`` for
        HTTP 429, ``UserError`` for any other 4xx status and ``ProviderError``
        for 5xx. Any other exception is logged and returned unchanged; the
        caller is expected to ``raise`` whatever this method returns.
    """
    # https://platform.openai.com/docs/guides/error-codes/api-errors
    from openai import APIStatusError

    if not isinstance(e, APIStatusError):
        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
        # Return instead of raising so this method always honours its
        # declared contract of handing an exception back to the caller.
        return e
    error_code = e.code
    if 400 <= e.status_code < 500:
        # user error
        if e.status_code == 401:
            return UserAuthError(e.message)
        if e.status_code == 429:
            # 429 indicates either rate limit exceeded or quota exceeded
            if error_code == "insufficient_quota":
                return QuotaError(e.message)
            return RateLimitError(e.message)
        return UserError(e.message)
    if e.status_code >= 500:
        return ProviderError(e.message)
    # Status codes below 400 are not expected on an error object.
    logger.error(f"unhandled exception from openai: {e}", exc_info=True)
    return e
|
|
59
|
+
|
|
28
60
|
def embed_query(self, query: str) -> list[float]:
|
|
61
|
+
|
|
29
62
|
client = self.config.get_client()
|
|
30
|
-
|
|
63
|
+
try:
|
|
64
|
+
response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
|
|
65
|
+
except Exception as e:
|
|
66
|
+
raise self.wrap_error(e=e)
|
|
31
67
|
return response.data[0].embedding
|
|
32
68
|
|
|
33
69
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
from pydantic import Field, SecretStr
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
7
|
+
from unstructured_ingest.logger import logger
|
|
7
8
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.errors import (
|
|
10
|
+
RateLimitError as CustomRateLimitError,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.v2.errors import (
|
|
13
|
+
UserAuthError,
|
|
14
|
+
UserError,
|
|
15
|
+
)
|
|
8
16
|
|
|
9
17
|
if TYPE_CHECKING:
|
|
10
18
|
from together import Together
|
|
@@ -27,6 +35,20 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
|
|
|
27
35
|
class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
28
36
|
config: TogetherAIEmbeddingConfig
|
|
29
37
|
|
|
38
|
+
def wrap_error(self, e: Exception) -> Exception:
    """Translate an exception raised by the Together client into an ingest error.

    Args:
        e: The exception caught around the embeddings request.

    Returns:
        ``UserAuthError`` for a Together ``AuthenticationError``, the ingest
        rate-limit error for a Together ``RateLimitError`` and ``UserError``
        for every other ``TogetherException``. Exceptions of any other type
        are logged and returned unchanged.
    """
    # https://docs.together.ai/docs/error-codes
    from together.error import AuthenticationError, RateLimitError, TogetherException

    if not isinstance(e, TogetherException):
        # Name the actual provider so the log line is not mistaken for an OpenAI failure.
        logger.error(f"unhandled exception from together: {e}", exc_info=True)
        return e
    # Fall back to str(e) so an exception created without arguments cannot
    # raise IndexError while it is being translated.
    message = e.args[0] if e.args else str(e)
    if isinstance(e, AuthenticationError):
        return UserAuthError(message)
    if isinstance(e, RateLimitError):
        return CustomRateLimitError(message)
    return UserError(message)
|
|
51
|
+
|
|
30
52
|
def embed_query(self, query: str) -> list[float]:
|
|
31
53
|
return self._embed_documents(elements=[query])[0]
|
|
32
54
|
|
|
@@ -36,5 +58,10 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
36
58
|
|
|
37
59
|
def _embed_documents(self, elements: list[str]) -> list[list[float]]:
|
|
38
60
|
client = self.config.get_client()
|
|
39
|
-
|
|
61
|
+
try:
|
|
62
|
+
outputs = client.embeddings.create(
|
|
63
|
+
model=self.config.embedder_model_name, input=elements
|
|
64
|
+
)
|
|
65
|
+
except Exception as e:
|
|
66
|
+
raise self.wrap_error(e=e)
|
|
40
67
|
return [outputs.data[i].embedding for i in range(len(elements))]
|
|
@@ -4,7 +4,16 @@ from typing import TYPE_CHECKING, Optional
|
|
|
4
4
|
from pydantic import Field, SecretStr
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
7
|
+
from unstructured_ingest.logger import logger
|
|
7
8
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.errors import (
|
|
10
|
+
ProviderError,
|
|
11
|
+
UserAuthError,
|
|
12
|
+
UserError,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.v2.errors import (
|
|
15
|
+
RateLimitError as CustomRateLimitError,
|
|
16
|
+
)
|
|
8
17
|
|
|
9
18
|
if TYPE_CHECKING:
|
|
10
19
|
from voyageai import Client as VoyageAIClient
|
|
@@ -38,9 +47,32 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
|
|
|
38
47
|
class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
39
48
|
config: VoyageAIEmbeddingConfig
|
|
40
49
|
|
|
50
|
+
def wrap_error(self, e: Exception) -> Exception:
    """Translate an exception raised by the VoyageAI client into an ingest error.

    Args:
        e: The exception caught around the embed request.

    Returns:
        ``UserAuthError`` for a Voyage ``AuthenticationError``, the ingest
        rate-limit error for a Voyage ``RateLimitError``, ``UserError`` for
        other 4xx statuses and ``ProviderError`` for 5xx. Anything else
        (including Voyage errors that carry no HTTP status) is logged and
        returned unchanged; the caller is expected to ``raise`` the result.
    """
    # https://docs.voyageai.com/docs/error-codes
    from voyageai.error import AuthenticationError, RateLimitError, VoyageError

    if not isinstance(e, VoyageError):
        logger.error(f"unhandled exception from voyageai: {e}", exc_info=True)
        # Return instead of raising so this method always honours its
        # declared contract of handing an exception back to the caller.
        return e
    http_code = e.http_status
    message = e.user_message
    if isinstance(e, AuthenticationError):
        return UserAuthError(message)
    if isinstance(e, RateLimitError):
        return CustomRateLimitError(message)
    # http_status may be absent (e.g. failures before a response arrived);
    # comparing None with an int would raise TypeError, so guard first.
    if http_code is not None:
        if 400 <= http_code < 500:
            return UserError(message)
        if http_code >= 500:
            return ProviderError(message)
    logger.error(f"unhandled exception from voyageai: {e}", exc_info=True)
    return e


def _embed_documents(self, elements: list[str]) -> list[list[float]]:
    """Embed a batch of texts with the configured VoyageAI model.

    Args:
        elements: The texts to embed.

    Returns:
        The ``embeddings`` attribute of the client response.

    Raises:
        Exception: Whatever ``wrap_error`` returns for a failure of the
            embed request.
    """
    client: VoyageAIClient = self.config.get_client()
    try:
        response = client.embed(texts=elements, model=self.config.embedder_model_name)
    except Exception as e:
        # The translated error must be raised: dropping it would fall through
        # to the return below with `response` unbound (UnboundLocalError).
        raise self.wrap_error(e=e)
    return response.embeddings
|
|
45
77
|
|
|
46
78
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
@@ -36,7 +36,7 @@ class FileDataSourceMetadata(BaseModel):
|
|
|
36
36
|
class FileData(BaseModel):
|
|
37
37
|
identifier: str
|
|
38
38
|
connector_type: str
|
|
39
|
-
source_identifiers: Optional[SourceIdentifiers] = None
|
|
39
|
+
source_identifiers: SourceIdentifiers
|
|
40
40
|
metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
|
|
41
41
|
additional_metadata: dict[str, Any] = Field(default_factory=dict)
|
|
42
42
|
reprocess: bool = False
|
|
@@ -73,6 +73,7 @@ class BatchItem(BaseModel):
|
|
|
73
73
|
class BatchFileData(FileData):
|
|
74
74
|
identifier: str = Field(init=False)
|
|
75
75
|
batch_items: list[BatchItem]
|
|
76
|
+
source_identifiers: Optional[SourceIdentifiers] = None
|
|
76
77
|
|
|
77
78
|
@field_validator("batch_items")
|
|
78
79
|
@classmethod
|
|
@@ -104,3 +105,12 @@ def file_data_from_file(path: str) -> FileData:
|
|
|
104
105
|
logger.debug(f"{path} not valid for batch file data")
|
|
105
106
|
|
|
106
107
|
return FileData.from_file(path=path)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def file_data_from_dict(data: dict) -> FileData:
|
|
111
|
+
try:
|
|
112
|
+
return BatchFileData.model_validate(data)
|
|
113
|
+
except ValidationError:
|
|
114
|
+
logger.debug(f"{data} not valid for batch file data")
|
|
115
|
+
|
|
116
|
+
return FileData.model_validate(data)
|
|
@@ -48,12 +48,16 @@ from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
|
|
|
48
48
|
from .outlook import outlook_source_entry
|
|
49
49
|
from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
|
|
50
50
|
from .pinecone import pinecone_destination_entry
|
|
51
|
+
from .redisdb import CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE
|
|
52
|
+
from .redisdb import redis_destination_entry
|
|
51
53
|
from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
|
|
52
54
|
from .salesforce import salesforce_source_entry
|
|
53
55
|
from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
|
|
54
56
|
from .sharepoint import sharepoint_source_entry
|
|
55
57
|
from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
|
|
56
58
|
from .slack import slack_source_entry
|
|
59
|
+
from .vectara import CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE
|
|
60
|
+
from .vectara import vectara_destination_entry
|
|
57
61
|
|
|
58
62
|
add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
|
|
59
63
|
add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
|
|
@@ -101,4 +105,7 @@ add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
|
|
|
101
105
|
|
|
102
106
|
add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
|
|
103
107
|
|
|
108
|
+
add_destination_entry(destination_type=VECTARA_CONNECTOR_TYPE, entry=vectara_destination_entry)
|
|
104
109
|
add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
|
|
110
|
+
|
|
111
|
+
add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)
|
|
@@ -30,6 +30,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
30
30
|
FileDataSourceMetadata,
|
|
31
31
|
Indexer,
|
|
32
32
|
IndexerConfig,
|
|
33
|
+
SourceIdentifiers,
|
|
33
34
|
Uploader,
|
|
34
35
|
UploaderConfig,
|
|
35
36
|
UploadStager,
|
|
@@ -267,6 +268,7 @@ class AstraDBDownloader(Downloader):
|
|
|
267
268
|
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
|
|
268
269
|
|
|
269
270
|
# modify input file_data for download_response
|
|
271
|
+
file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
|
|
270
272
|
cast_file_data = FileData.cast(file_data=file_data)
|
|
271
273
|
cast_file_data.identifier = filename
|
|
272
274
|
cast_file_data.metadata.date_processed = str(time())
|
|
@@ -27,6 +27,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
27
27
|
FileDataSourceMetadata,
|
|
28
28
|
Indexer,
|
|
29
29
|
IndexerConfig,
|
|
30
|
+
SourceIdentifiers,
|
|
30
31
|
Uploader,
|
|
31
32
|
UploaderConfig,
|
|
32
33
|
UploadStager,
|
|
@@ -261,6 +262,7 @@ class CouchbaseDownloader(Downloader):
|
|
|
261
262
|
exc_info=True,
|
|
262
263
|
)
|
|
263
264
|
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
|
|
265
|
+
file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
|
|
264
266
|
cast_file_data = FileData.cast(file_data=file_data)
|
|
265
267
|
cast_file_data.identifier = filename_id
|
|
266
268
|
cast_file_data.metadata.date_processed = str(time.time())
|
|
@@ -14,6 +14,7 @@ from unstructured_ingest.error import (
|
|
|
14
14
|
)
|
|
15
15
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
16
|
from unstructured_ingest.v2.interfaces import (
|
|
17
|
+
AccessConfig,
|
|
17
18
|
ConnectionConfig,
|
|
18
19
|
Downloader,
|
|
19
20
|
DownloaderConfig,
|
|
@@ -52,6 +53,10 @@ class DatabricksPathMixin(BaseModel):
|
|
|
52
53
|
return path
|
|
53
54
|
|
|
54
55
|
|
|
56
|
+
class DatabricksVolumesAccessConfig(AccessConfig):
|
|
57
|
+
token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
|
|
58
|
+
|
|
59
|
+
|
|
55
60
|
class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
|
|
56
61
|
host: Optional[str] = Field(
|
|
57
62
|
default=None,
|
|
@@ -3,12 +3,12 @@ from typing import Optional
|
|
|
3
3
|
|
|
4
4
|
from pydantic import Field, Secret
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.v2.interfaces import AccessConfig
|
|
7
6
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
8
7
|
DestinationRegistryEntry,
|
|
9
8
|
SourceRegistryEntry,
|
|
10
9
|
)
|
|
11
10
|
from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
11
|
+
DatabricksVolumesAccessConfig,
|
|
12
12
|
DatabricksVolumesConnectionConfig,
|
|
13
13
|
DatabricksVolumesDownloader,
|
|
14
14
|
DatabricksVolumesDownloaderConfig,
|
|
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
|
21
21
|
CONNECTOR_TYPE = "databricks_volumes_aws"
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
class DatabricksAWSVolumesAccessConfig(AccessConfig):
|
|
24
|
+
class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
|
|
25
25
|
account_id: Optional[str] = Field(
|
|
26
26
|
default=None,
|
|
27
27
|
description="The Databricks account ID for the Databricks " "accounts endpoint",
|
|
@@ -3,12 +3,12 @@ from typing import Optional
|
|
|
3
3
|
|
|
4
4
|
from pydantic import Field, Secret
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.v2.interfaces import AccessConfig
|
|
7
6
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
8
7
|
DestinationRegistryEntry,
|
|
9
8
|
SourceRegistryEntry,
|
|
10
9
|
)
|
|
11
10
|
from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
11
|
+
DatabricksVolumesAccessConfig,
|
|
12
12
|
DatabricksVolumesConnectionConfig,
|
|
13
13
|
DatabricksVolumesDownloader,
|
|
14
14
|
DatabricksVolumesDownloaderConfig,
|
|
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
|
21
21
|
CONNECTOR_TYPE = "databricks_volumes_azure"
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
class DatabricksAzureVolumesAccessConfig(AccessConfig):
|
|
24
|
+
class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
|
|
25
25
|
account_id: Optional[str] = Field(
|
|
26
26
|
default=None,
|
|
27
27
|
description="The Databricks account ID for the Databricks " "accounts endpoint.",
|