unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (53)
  1. test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
  2. test/integration/connectors/sql/test_postgres.py +6 -2
  3. test/integration/connectors/sql/test_singlestore.py +6 -2
  4. test/integration/connectors/sql/test_snowflake.py +6 -2
  5. test/integration/connectors/sql/test_sqlite.py +6 -2
  6. test/integration/connectors/test_milvus.py +13 -0
  7. test/integration/connectors/test_onedrive.py +6 -0
  8. test/integration/connectors/test_redis.py +119 -0
  9. test/integration/connectors/test_vectara.py +270 -0
  10. test/integration/embedders/test_bedrock.py +28 -0
  11. test/integration/embedders/test_octoai.py +14 -0
  12. test/integration/embedders/test_openai.py +13 -0
  13. test/integration/embedders/test_togetherai.py +10 -0
  14. test/integration/partitioners/test_partitioner.py +2 -2
  15. test/unit/embed/test_octoai.py +8 -1
  16. unstructured_ingest/__version__.py +1 -1
  17. unstructured_ingest/embed/bedrock.py +39 -11
  18. unstructured_ingest/embed/interfaces.py +5 -0
  19. unstructured_ingest/embed/octoai.py +44 -3
  20. unstructured_ingest/embed/openai.py +37 -1
  21. unstructured_ingest/embed/togetherai.py +28 -1
  22. unstructured_ingest/embed/voyageai.py +33 -1
  23. unstructured_ingest/v2/errors.py +18 -0
  24. unstructured_ingest/v2/interfaces/file_data.py +11 -1
  25. unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
  26. unstructured_ingest/v2/processes/connectors/astradb.py +2 -0
  27. unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
  28. unstructured_ingest/v2/processes/connectors/couchbase.py +2 -0
  29. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
  30. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
  31. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
  32. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
  33. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
  34. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +1 -1
  35. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
  36. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
  37. unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
  38. unstructured_ingest/v2/processes/connectors/mongodb.py +3 -4
  39. unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
  40. unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
  41. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
  42. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  43. unstructured_ingest/v2/processes/connectors/sql/sql.py +5 -0
  44. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  45. unstructured_ingest/v2/unstructured_api.py +25 -2
  46. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +20 -16
  47. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +52 -48
  48. test/integration/connectors/test_kafka.py +0 -304
  49. /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
  50. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
  51. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
  52. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
  53. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,12 @@ import json
2
2
  import os
3
3
  from pathlib import Path
4
4
 
5
+ import pytest
6
+
5
7
  from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
8
  from test.integration.utils import requires_env
7
9
  from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
10
+ from unstructured_ingest.v2.errors import UserAuthError, UserError
8
11
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
12
 
10
13
 
@@ -47,3 +50,28 @@ def test_raw_bedrock_embedder(embedder_file: Path):
47
50
  expected_dimensions=(1536,),
48
51
  expected_is_unit_vector=False,
49
52
  )
53
+
54
+
55
+ def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
56
+ embedder = BedrockEmbeddingEncoder(
57
+ config=BedrockEmbeddingConfig(
58
+ aws_access_key_id="no_key",
59
+ aws_secret_access_key="no_secret",
60
+ )
61
+ )
62
+ with pytest.raises(UserAuthError):
63
+ embedder.get_exemplary_embedding()
64
+
65
+
66
+ @requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
67
+ def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
68
+ aws_credentials = get_aws_credentials()
69
+ embedder = BedrockEmbeddingEncoder(
70
+ config=BedrockEmbeddingConfig(
71
+ aws_access_key_id=aws_credentials["aws_access_key_id"],
72
+ aws_secret_access_key=aws_credentials["aws_secret_access_key"],
73
+ model_name="invalid_model",
74
+ )
75
+ )
76
+ with pytest.raises(UserError):
77
+ embedder.get_exemplary_embedding()
@@ -2,9 +2,12 @@ import json
2
2
  import os
3
3
  from pathlib import Path
4
4
 
5
+ import pytest
6
+
5
7
  from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
8
  from test.integration.utils import requires_env
7
9
  from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
10
+ from unstructured_ingest.v2.errors import UserAuthError
8
11
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
12
 
10
13
  API_KEY = "OCTOAI_API_KEY"
@@ -39,3 +42,14 @@ def test_raw_octoai_embedder(embedder_file: Path):
39
42
  validate_raw_embedder(
40
43
  embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
41
44
  )
45
+
46
+
47
+ @pytest.mark.skip(reason="Unexpected connection error at the moment")
48
+ def test_raw_octoai_embedder_invalid_credentials():
49
+ embedder = OctoAIEmbeddingEncoder(
50
+ config=OctoAiEmbeddingConfig(
51
+ api_key="fake_api_key",
52
+ )
53
+ )
54
+ with pytest.raises(UserAuthError):
55
+ embedder.get_exemplary_embedding()
@@ -2,9 +2,12 @@ import json
2
2
  import os
3
3
  from pathlib import Path
4
4
 
5
+ import pytest
6
+
5
7
  from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
8
  from test.integration.utils import requires_env
7
9
  from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
10
+ from unstructured_ingest.v2.errors import UserAuthError
8
11
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
12
 
10
13
  API_KEY = "OPENAI_API_KEY"
@@ -39,3 +42,13 @@ def test_raw_openai_embedder(embedder_file: Path):
39
42
  validate_raw_embedder(
40
43
  embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
41
44
  )
45
+
46
+
47
+ def test_raw_openai_embedder_invalid_credentials():
48
+ embedder = OpenAIEmbeddingEncoder(
49
+ config=OpenAIEmbeddingConfig(
50
+ api_key="fake_api_key",
51
+ )
52
+ )
53
+ with pytest.raises(UserAuthError):
54
+ embedder.get_exemplary_embedding()
@@ -2,12 +2,15 @@ import json
2
2
  import os
3
3
  from pathlib import Path
4
4
 
5
+ import pytest
6
+
5
7
  from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
8
  from test.integration.utils import requires_env
7
9
  from unstructured_ingest.embed.togetherai import (
8
10
  TogetherAIEmbeddingConfig,
9
11
  TogetherAIEmbeddingEncoder,
10
12
  )
13
+ from unstructured_ingest.v2.errors import UserAuthError
11
14
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
12
15
 
13
16
  API_KEY = "TOGETHERAI_API_KEY"
@@ -41,3 +44,10 @@ def test_raw_togetherai_embedder(embedder_file: Path):
41
44
  expected_dimensions=(768,),
42
45
  expected_is_unit_vector=False,
43
46
  )
47
+
48
+
49
+ def test_raw_togetherai_embedder_invalid_credentials():
50
+ embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key="fake_api_key"))
51
+
52
+ with pytest.raises(UserAuthError):
53
+ embedder.get_exemplary_embedding()
@@ -3,9 +3,9 @@ import os
3
3
  from pathlib import Path
4
4
 
5
5
  import pytest
6
- from unstructured_client.models.errors.sdkerror import SDKError
7
6
 
8
7
  from test.integration.utils import requires_env
8
+ from unstructured_ingest.v2.errors import UserError
9
9
  from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
10
10
 
11
11
  int_test_dir = Path(__file__).parent
@@ -71,5 +71,5 @@ async def test_partitioner_api_fast_error(partition_file: Path):
71
71
  strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
72
72
  )
73
73
  partitioner = Partitioner(config=partitioner_config)
74
- with pytest.raises(SDKError):
74
+ with pytest.raises(UserError):
75
75
  await partitioner.run_async(filename=partition_file)
@@ -4,7 +4,14 @@ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbedd
4
4
  def test_embed_documents_does_not_break_element_to_dict(mocker):
5
5
  # Mocked client with the desired behavior for embed_documents
6
6
  mock_client = mocker.MagicMock()
7
- mock_client.embed_documents.return_value = [1, 2]
7
+ mock_data = []
8
+ for i in range(2):
9
+ data = mocker.MagicMock()
10
+ data.embedding = [1, 2]
11
+ mock_data.append(data)
12
+ mock_response = mocker.MagicMock()
13
+ mock_response.data = mock_data
14
+ mock_client.embeddings.create.return_value = mock_response
8
15
 
9
16
  # Mock get_client to return our mock_client
10
17
  mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client)
@@ -1 +1 @@
1
- __version__ = "0.3.10" # pragma: no cover
1
+ __version__ = "0.3.12" # pragma: no cover
@@ -6,7 +6,9 @@ from typing import TYPE_CHECKING
6
6
  from pydantic import Field, SecretStr
7
7
 
8
8
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
9
+ from unstructured_ingest.logger import logger
9
10
  from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
10
12
 
11
13
  if TYPE_CHECKING:
12
14
  from botocore.client import BaseClient
@@ -44,6 +46,32 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
44
46
  class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
45
47
  config: BedrockEmbeddingConfig
46
48
 
49
+ def wrap_error(self, e: Exception) -> Exception:
50
+ from botocore.exceptions import ClientError
51
+
52
+ if isinstance(e, ClientError):
53
+ # https://docs.aws.amazon.com/awssupport/latest/APIReference/CommonErrors.html
54
+ http_response = e.response
55
+ meta = http_response["ResponseMetadata"]
56
+ http_response_code = meta["HTTPStatusCode"]
57
+ error_code = http_response["Error"]["Code"]
58
+ if http_response_code == 400:
59
+ if error_code == "ValidationError":
60
+ return UserError(http_response["Error"])
61
+ elif error_code == "ThrottlingException":
62
+ return RateLimitError(http_response["Error"])
63
+ elif error_code == "NotAuthorized" or error_code == "AccessDeniedException":
64
+ return UserAuthError(http_response["Error"])
65
+ if http_response_code == 403:
66
+ return UserAuthError(http_response["Error"])
67
+ if 400 <= http_response_code < 500:
68
+ return UserError(http_response["Error"])
69
+ if http_response_code >= 500:
70
+ return ProviderError(http_response["Error"])
71
+
72
+ logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
73
+ return e
74
+
47
75
  def embed_query(self, query: str) -> list[float]:
48
76
  """Call out to Bedrock embedding endpoint."""
49
77
  # replace newlines, which can negatively affect performance.
@@ -61,25 +89,25 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
61
89
  input_body["inputText"] = text
62
90
  body = json.dumps(input_body)
63
91
 
92
+ bedrock_client = self.config.get_client()
93
+ # invoke bedrock API
64
94
  try:
65
- bedrock_client = self.config.get_client()
66
- # invoke bedrock API
67
95
  response = bedrock_client.invoke_model(
68
96
  body=body,
69
97
  modelId=self.config.embed_model_name,
70
98
  accept="application/json",
71
99
  contentType="application/json",
72
100
  )
73
-
74
- # format output based on provider
75
- response_body = json.loads(response.get("body").read())
76
- if provider == "cohere":
77
- return response_body.get("embeddings")[0]
78
- else:
79
- # includes common provider == "amazon"
80
- return response_body.get("embedding")
81
101
  except Exception as e:
82
- raise ValueError(f"Error raised by inference endpoint: {e}")
102
+ raise self.wrap_error(e=e)
103
+
104
+ # format output based on provider
105
+ response_body = json.loads(response.get("body").read())
106
+ if provider == "cohere":
107
+ return response_body.get("embeddings")[0]
108
+ else:
109
+ # includes common provider == "amazon"
110
+ return response_body.get("embedding")
83
111
 
84
112
  def embed_documents(self, elements: list[dict]) -> list[dict]:
85
113
  embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
@@ -17,6 +17,11 @@ class BaseEmbeddingEncoder(ABC):
17
17
  """Initializes the embedding encoder class. Should also validate the instance
18
18
  is properly configured: e.g., embed a single element"""
19
19
 
20
+ def wrap_error(self, e: Exception) -> Exception:
21
+ """Handle errors from the embedding service. Should raise a more informative error
22
+ if possible"""
23
+ return e
24
+
20
25
  @property
21
26
  def num_of_dimensions(self) -> tuple[int, ...]:
22
27
  exemplary_embedding = self.get_exemplary_embedding()
@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
4
4
  from pydantic import Field, SecretStr
5
5
 
6
6
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
7
+ from unstructured_ingest.logger import logger
7
8
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.errors import (
10
+ ProviderError,
11
+ QuotaError,
12
+ RateLimitError,
13
+ UserAuthError,
14
+ UserError,
15
+ )
8
16
 
9
17
  if TYPE_CHECKING:
10
18
  from openai import OpenAI
@@ -30,12 +38,45 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
30
38
  class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
31
39
  config: OctoAiEmbeddingConfig
32
40
 
41
+ def wrap_error(self, e: Exception) -> Exception:
42
+ # https://platform.openai.com/docs/guides/error-codes/api-errors
43
+ from openai import APIStatusError
44
+
45
+ if not isinstance(e, APIStatusError):
46
+ logger.error(f"unhandled exception from openai: {e}", exc_info=True)
47
+ raise e
48
+ error_code = e.code
49
+ if 400 <= e.status_code < 500:
50
+ # user error
51
+ if e.status_code == 401:
52
+ return UserAuthError(e.message)
53
+ if e.status_code == 429:
54
+ # 429 indicates rate limit exceeded and quota exceeded
55
+ if error_code == "insufficient_quota":
56
+ return QuotaError(e.message)
57
+ else:
58
+ return RateLimitError(e.message)
59
+ return UserError(e.message)
60
+ if e.status_code >= 500:
61
+ return ProviderError(e.message)
62
+ logger.error(f"unhandled exception from openai: {e}", exc_info=True)
63
+ return e
64
+
33
65
  def embed_query(self, query: str):
34
- client = self.config.get_client()
35
- response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
66
+ try:
67
+ client = self.config.get_client()
68
+ response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
69
+ except Exception as e:
70
+ raise self.wrap_error(e=e)
36
71
  return response.data[0].embedding
37
72
 
38
73
  def embed_documents(self, elements: list[dict]) -> list[dict]:
39
- embeddings = [self.embed_query(e.get("text", "")) for e in elements]
74
+ texts = [e.get("text", "") for e in elements]
75
+ try:
76
+ client = self.config.get_client()
77
+ response = client.embeddings.create(input=texts, model=self.config.embedder_model_name)
78
+ except Exception as e:
79
+ raise self.wrap_error(e=e)
80
+ embeddings = [data.embedding for data in response.data]
40
81
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
41
82
  return elements_with_embeddings
@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
4
4
  from pydantic import Field, SecretStr
5
5
 
6
6
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
7
+ from unstructured_ingest.logger import logger
7
8
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.errors import (
10
+ ProviderError,
11
+ QuotaError,
12
+ RateLimitError,
13
+ UserAuthError,
14
+ UserError,
15
+ )
8
16
 
9
17
  if TYPE_CHECKING:
10
18
  from openai import OpenAI
@@ -25,9 +33,37 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
25
33
  class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
26
34
  config: OpenAIEmbeddingConfig
27
35
 
36
+ def wrap_error(self, e: Exception) -> Exception:
37
+ # https://platform.openai.com/docs/guides/error-codes/api-errors
38
+ from openai import APIStatusError
39
+
40
+ if not isinstance(e, APIStatusError):
41
+ logger.error(f"unhandled exception from openai: {e}", exc_info=True)
42
+ raise e
43
+ error_code = e.code
44
+ if 400 <= e.status_code < 500:
45
+ # user error
46
+ if e.status_code == 401:
47
+ return UserAuthError(e.message)
48
+ if e.status_code == 429:
49
+ # 429 indicates rate limit exceeded and quota exceeded
50
+ if error_code == "insufficient_quota":
51
+ return QuotaError(e.message)
52
+ else:
53
+ return RateLimitError(e.message)
54
+ return UserError(e.message)
55
+ if e.status_code >= 500:
56
+ return ProviderError(e.message)
57
+ logger.error(f"unhandled exception from openai: {e}", exc_info=True)
58
+ return e
59
+
28
60
  def embed_query(self, query: str) -> list[float]:
61
+
29
62
  client = self.config.get_client()
30
- response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
63
+ try:
64
+ response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
65
+ except Exception as e:
66
+ raise self.wrap_error(e=e)
31
67
  return response.data[0].embedding
32
68
 
33
69
  def embed_documents(self, elements: list[dict]) -> list[dict]:
@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
4
4
  from pydantic import Field, SecretStr
5
5
 
6
6
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
7
+ from unstructured_ingest.logger import logger
7
8
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.errors import (
10
+ RateLimitError as CustomRateLimitError,
11
+ )
12
+ from unstructured_ingest.v2.errors import (
13
+ UserAuthError,
14
+ UserError,
15
+ )
8
16
 
9
17
  if TYPE_CHECKING:
10
18
  from together import Together
@@ -27,6 +35,20 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
27
35
  class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
28
36
  config: TogetherAIEmbeddingConfig
29
37
 
38
+ def wrap_error(self, e: Exception) -> Exception:
39
+ # https://docs.together.ai/docs/error-codes
40
+ from together.error import AuthenticationError, RateLimitError, TogetherException
41
+
42
+ if not isinstance(e, TogetherException):
43
+ logger.error(f"unhandled exception from openai: {e}", exc_info=True)
44
+ return e
45
+ message = e.args[0]
46
+ if isinstance(e, AuthenticationError):
47
+ return UserAuthError(message)
48
+ if isinstance(e, RateLimitError):
49
+ return CustomRateLimitError(message)
50
+ return UserError(message)
51
+
30
52
  def embed_query(self, query: str) -> list[float]:
31
53
  return self._embed_documents(elements=[query])[0]
32
54
 
@@ -36,5 +58,10 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
36
58
 
37
59
  def _embed_documents(self, elements: list[str]) -> list[list[float]]:
38
60
  client = self.config.get_client()
39
- outputs = client.embeddings.create(model=self.config.embedder_model_name, input=elements)
61
+ try:
62
+ outputs = client.embeddings.create(
63
+ model=self.config.embedder_model_name, input=elements
64
+ )
65
+ except Exception as e:
66
+ raise self.wrap_error(e=e)
40
67
  return [outputs.data[i].embedding for i in range(len(elements))]
@@ -4,7 +4,16 @@ from typing import TYPE_CHECKING, Optional
4
4
  from pydantic import Field, SecretStr
5
5
 
6
6
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
7
+ from unstructured_ingest.logger import logger
7
8
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.errors import (
10
+ ProviderError,
11
+ UserAuthError,
12
+ UserError,
13
+ )
14
+ from unstructured_ingest.v2.errors import (
15
+ RateLimitError as CustomRateLimitError,
16
+ )
8
17
 
9
18
  if TYPE_CHECKING:
10
19
  from voyageai import Client as VoyageAIClient
@@ -38,9 +47,32 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
38
47
  class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
39
48
  config: VoyageAIEmbeddingConfig
40
49
 
50
+ def wrap_error(self, e: Exception) -> Exception:
51
+ # https://docs.voyageai.com/docs/error-codes
52
+ from voyageai.error import AuthenticationError, RateLimitError, VoyageError
53
+
54
+ if not isinstance(e, VoyageError):
55
+ logger.error(f"unhandled exception from openai: {e}", exc_info=True)
56
+ raise e
57
+ http_code = e.http_status
58
+ message = e.user_message
59
+ if isinstance(e, AuthenticationError):
60
+ return UserAuthError(message)
61
+ if isinstance(e, RateLimitError):
62
+ return CustomRateLimitError(message)
63
+ if 400 <= http_code < 500:
64
+ return UserError(message)
65
+ if http_code >= 500:
66
+ return ProviderError(message)
67
+ logger.error(f"unhandled exception from openai: {e}", exc_info=True)
68
+ return e
69
+
41
70
  def _embed_documents(self, elements: list[str]) -> list[list[float]]:
42
71
  client: VoyageAIClient = self.config.get_client()
43
- response = client.embed(texts=elements, model=self.config.embedder_model_name)
72
+ try:
73
+ response = client.embed(texts=elements, model=self.config.embedder_model_name)
74
+ except Exception as e:
75
+ self.wrap_error(e=e)
44
76
  return response.embeddings
45
77
 
46
78
  def embed_documents(self, elements: list[dict]) -> list[dict]:
@@ -0,0 +1,18 @@
1
+ class UserError(Exception):
2
+ pass
3
+
4
+
5
+ class UserAuthError(UserError):
6
+ pass
7
+
8
+
9
+ class RateLimitError(UserError):
10
+ pass
11
+
12
+
13
+ class QuotaError(UserError):
14
+ pass
15
+
16
+
17
+ class ProviderError(Exception):
18
+ pass
@@ -36,7 +36,7 @@ class FileDataSourceMetadata(BaseModel):
36
36
  class FileData(BaseModel):
37
37
  identifier: str
38
38
  connector_type: str
39
- source_identifiers: Optional[SourceIdentifiers] = None
39
+ source_identifiers: SourceIdentifiers
40
40
  metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
41
41
  additional_metadata: dict[str, Any] = Field(default_factory=dict)
42
42
  reprocess: bool = False
@@ -73,6 +73,7 @@ class BatchItem(BaseModel):
73
73
  class BatchFileData(FileData):
74
74
  identifier: str = Field(init=False)
75
75
  batch_items: list[BatchItem]
76
+ source_identifiers: Optional[SourceIdentifiers] = None
76
77
 
77
78
  @field_validator("batch_items")
78
79
  @classmethod
@@ -104,3 +105,12 @@ def file_data_from_file(path: str) -> FileData:
104
105
  logger.debug(f"{path} not valid for batch file data")
105
106
 
106
107
  return FileData.from_file(path=path)
108
+
109
+
110
+ def file_data_from_dict(data: dict) -> FileData:
111
+ try:
112
+ return BatchFileData.model_validate(data)
113
+ except ValidationError:
114
+ logger.debug(f"{data} not valid for batch file data")
115
+
116
+ return FileData.model_validate(data)
@@ -48,12 +48,16 @@ from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
48
48
  from .outlook import outlook_source_entry
49
49
  from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
50
50
  from .pinecone import pinecone_destination_entry
51
+ from .redisdb import CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE
52
+ from .redisdb import redis_destination_entry
51
53
  from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
52
54
  from .salesforce import salesforce_source_entry
53
55
  from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
54
56
  from .sharepoint import sharepoint_source_entry
55
57
  from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
56
58
  from .slack import slack_source_entry
59
+ from .vectara import CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE
60
+ from .vectara import vectara_destination_entry
57
61
 
58
62
  add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
59
63
  add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -101,4 +105,7 @@ add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
101
105
 
102
106
  add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
103
107
 
108
+ add_destination_entry(destination_type=VECTARA_CONNECTOR_TYPE, entry=vectara_destination_entry)
104
109
  add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
110
+
111
+ add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)
@@ -30,6 +30,7 @@ from unstructured_ingest.v2.interfaces import (
30
30
  FileDataSourceMetadata,
31
31
  Indexer,
32
32
  IndexerConfig,
33
+ SourceIdentifiers,
33
34
  Uploader,
34
35
  UploaderConfig,
35
36
  UploadStager,
@@ -267,6 +268,7 @@ class AstraDBDownloader(Downloader):
267
268
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
268
269
 
269
270
  # modify input file_data for download_response
271
+ file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
270
272
  cast_file_data = FileData.cast(file_data=file_data)
271
273
  cast_file_data.identifier = filename
272
274
  cast_file_data.metadata.date_processed = str(time())
@@ -138,7 +138,6 @@ class ChromaUploader(Uploader):
138
138
 
139
139
  @DestinationConnectionError.wrap
140
140
  def upsert_batch(self, collection, batch):
141
-
142
141
  try:
143
142
  # Chroma wants lists even if there is only one element
144
143
  # Upserting to prevent duplicates
@@ -27,6 +27,7 @@ from unstructured_ingest.v2.interfaces import (
27
27
  FileDataSourceMetadata,
28
28
  Indexer,
29
29
  IndexerConfig,
30
+ SourceIdentifiers,
30
31
  Uploader,
31
32
  UploaderConfig,
32
33
  UploadStager,
@@ -261,6 +262,7 @@ class CouchbaseDownloader(Downloader):
261
262
  exc_info=True,
262
263
  )
263
264
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
265
+ file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
264
266
  cast_file_data = FileData.cast(file_data=file_data)
265
267
  cast_file_data.identifier = filename_id
266
268
  cast_file_data.metadata.date_processed = str(time.time())
@@ -14,6 +14,7 @@ from unstructured_ingest.error import (
14
14
  )
15
15
  from unstructured_ingest.utils.dep_check import requires_dependencies
16
16
  from unstructured_ingest.v2.interfaces import (
17
+ AccessConfig,
17
18
  ConnectionConfig,
18
19
  Downloader,
19
20
  DownloaderConfig,
@@ -52,6 +53,10 @@ class DatabricksPathMixin(BaseModel):
52
53
  return path
53
54
 
54
55
 
56
+ class DatabricksVolumesAccessConfig(AccessConfig):
57
+ token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
58
+
59
+
55
60
  class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
56
61
  host: Optional[str] = Field(
57
62
  default=None,
@@ -3,12 +3,12 @@ from typing import Optional
3
3
 
4
4
  from pydantic import Field, Secret
5
5
 
6
- from unstructured_ingest.v2.interfaces import AccessConfig
7
6
  from unstructured_ingest.v2.processes.connector_registry import (
8
7
  DestinationRegistryEntry,
9
8
  SourceRegistryEntry,
10
9
  )
11
10
  from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
12
  DatabricksVolumesConnectionConfig,
13
13
  DatabricksVolumesDownloader,
14
14
  DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
21
21
  CONNECTOR_TYPE = "databricks_volumes_aws"
22
22
 
23
23
 
24
- class DatabricksAWSVolumesAccessConfig(AccessConfig):
24
+ class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
25
25
  account_id: Optional[str] = Field(
26
26
  default=None,
27
27
  description="The Databricks account ID for the Databricks " "accounts endpoint",
@@ -3,12 +3,12 @@ from typing import Optional
3
3
 
4
4
  from pydantic import Field, Secret
5
5
 
6
- from unstructured_ingest.v2.interfaces import AccessConfig
7
6
  from unstructured_ingest.v2.processes.connector_registry import (
8
7
  DestinationRegistryEntry,
9
8
  SourceRegistryEntry,
10
9
  )
11
10
  from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
12
  DatabricksVolumesConnectionConfig,
13
13
  DatabricksVolumesDownloader,
14
14
  DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
21
21
  CONNECTOR_TYPE = "databricks_volumes_azure"
22
22
 
23
23
 
24
- class DatabricksAzureVolumesAccessConfig(AccessConfig):
24
+ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
25
25
  account_id: Optional[str] = Field(
26
26
  default=None,
27
27
  description="The Databricks account ID for the Databricks " "accounts endpoint.",