PyPI - unstructured-ingest - Versions diffs - 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl - Mend

unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (53)

test/integration/embedders/test_bedrock.py CHANGED Viewed

@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
+import pytest
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError, UserError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
@@ -47,3 +50,28 @@ def test_raw_bedrock_embedder(embedder_file: Path):
         expected_dimensions=(1536,),
         expected_is_unit_vector=False,
     )
+def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
+    embedder = BedrockEmbeddingEncoder(
+        config=BedrockEmbeddingConfig(
+            aws_access_key_id="no_key",
+            aws_secret_access_key="no_secret",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
+@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
+def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
+    aws_credentials = get_aws_credentials()
+    embedder = BedrockEmbeddingEncoder(
+        config=BedrockEmbeddingConfig(
+            aws_access_key_id=aws_credentials["aws_access_key_id"],
+            aws_secret_access_key=aws_credentials["aws_secret_access_key"],
+            model_name="invalid_model",
+        )
+    )
+    with pytest.raises(UserError):
+        embedder.get_exemplary_embedding()

test/integration/embedders/test_octoai.py CHANGED Viewed

@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
+import pytest
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 API_KEY = "OCTOAI_API_KEY"
@@ -39,3 +42,14 @@ def test_raw_octoai_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
     )
+@pytest.mark.skip(reason="Unexpected connection error at the moment")
+def test_raw_octoai_embedder_invalid_credentials():
+    embedder = OctoAIEmbeddingEncoder(
+        config=OctoAiEmbeddingConfig(
+            api_key="fake_api_key",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()

test/integration/embedders/test_openai.py CHANGED Viewed

@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
+import pytest
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 API_KEY = "OPENAI_API_KEY"
@@ -39,3 +42,13 @@ def test_raw_openai_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
     )
+def test_raw_openai_embedder_invalid_credentials():
+    embedder = OpenAIEmbeddingEncoder(
+        config=OpenAIEmbeddingConfig(
+            api_key="fake_api_key",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()

test/integration/embedders/test_togetherai.py CHANGED Viewed

@@ -2,12 +2,15 @@ import json
 import os
 from pathlib import Path
+import pytest
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.togetherai import (
     TogetherAIEmbeddingConfig,
     TogetherAIEmbeddingEncoder,
 )
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 API_KEY = "TOGETHERAI_API_KEY"
@@ -41,3 +44,10 @@ def test_raw_togetherai_embedder(embedder_file: Path):
         expected_dimensions=(768,),
         expected_is_unit_vector=False,
     )
+def test_raw_togetherai_embedder_invalid_credentials():
+    embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key="fake_api_key"))
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()

test/integration/partitioners/test_partitioner.py CHANGED Viewed

@@ -3,9 +3,9 @@ import os
 from pathlib import Path
 import pytest
-from unstructured_client.models.errors.sdkerror import SDKError
 from test.integration.utils import requires_env
+from unstructured_ingest.v2.errors import UserError
 from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
 int_test_dir = Path(__file__).parent
@@ -71,5 +71,5 @@ async def test_partitioner_api_fast_error(partition_file: Path):
         strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
     )
     partitioner = Partitioner(config=partitioner_config)
-    with pytest.raises(SDKError):
+    with pytest.raises(UserError):
         await partitioner.run_async(filename=partition_file)

test/unit/embed/test_octoai.py CHANGED Viewed

@@ -4,7 +4,14 @@ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbedd
 def test_embed_documents_does_not_break_element_to_dict(mocker):
     # Mocked client with the desired behavior for embed_documents
     mock_client = mocker.MagicMock()
-    mock_client.embed_documents.return_value = [1, 2]
+    mock_data = []
+    for i in range(2):
+        data = mocker.MagicMock()
+        data.embedding = [1, 2]
+        mock_data.append(data)
+    mock_response = mocker.MagicMock()
+    mock_response.data = mock_data
+    mock_client.embeddings.create.return_value = mock_response
     # Mock get_client to return our mock_client
     mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client)

unstructured_ingest/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.3.10" # pragma: no cover
1	+ __version__ = "0.3.12" # pragma: no cover

unstructured_ingest/embed/bedrock.py CHANGED Viewed

@@ -6,7 +6,9 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
 if TYPE_CHECKING:
     from botocore.client import BaseClient
@@ -44,6 +46,32 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
 class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     config: BedrockEmbeddingConfig
+    def wrap_error(self, e: Exception) -> Exception:
+        from botocore.exceptions import ClientError
+        if isinstance(e, ClientError):
+            # https://docs.aws.amazon.com/awssupport/latest/APIReference/CommonErrors.html
+            http_response = e.response
+            meta = http_response["ResponseMetadata"]
+            http_response_code = meta["HTTPStatusCode"]
+            error_code = http_response["Error"]["Code"]
+            if http_response_code == 400:
+                if error_code == "ValidationError":
+                    return UserError(http_response["Error"])
+                elif error_code == "ThrottlingException":
+                    return RateLimitError(http_response["Error"])
+                elif error_code == "NotAuthorized" or error_code == "AccessDeniedException":
+                    return UserAuthError(http_response["Error"])
+            if http_response_code == 403:
+                return UserAuthError(http_response["Error"])
+            if 400 <= http_response_code < 500:
+                return UserError(http_response["Error"])
+            if http_response_code >= 500:
+                return ProviderError(http_response["Error"])
+        logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
+        return e
     def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
         # replace newlines, which can negatively affect performance.
@@ -61,25 +89,25 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
             input_body["inputText"] = text
         body = json.dumps(input_body)
+        bedrock_client = self.config.get_client()
+        # invoke bedrock API
         try:
-            bedrock_client = self.config.get_client()
-            # invoke bedrock API
             response = bedrock_client.invoke_model(
                 body=body,
                 modelId=self.config.embed_model_name,
                 accept="application/json",
                 contentType="application/json",
             )
-            # format output based on provider
-            response_body = json.loads(response.get("body").read())
-            if provider == "cohere":
-                return response_body.get("embeddings")[0]
-            else:
-                # includes common provider == "amazon"
-                return response_body.get("embedding")
         except Exception as e:
-            raise ValueError(f"Error raised by inference endpoint: {e}")
+            raise self.wrap_error(e=e)
+        # format output based on provider
+        response_body = json.loads(response.get("body").read())
+        if provider == "cohere":
+            return response_body.get("embeddings")[0]
+        else:
+            # includes common provider == "amazon"
+            return response_body.get("embedding")
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]

unstructured_ingest/embed/interfaces.py CHANGED Viewed

@@ -17,6 +17,11 @@ class BaseEmbeddingEncoder(ABC):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
+    def wrap_error(self, e: Exception) -> Exception:
+        """Handle errors from the embedding service. Should raise a more informative error
+        if possible"""
+        return e
     @property
     def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()

unstructured_ingest/embed/octoai.py CHANGED Viewed

@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.errors import (
+    ProviderError,
+    QuotaError,
+    RateLimitError,
+    UserAuthError,
+    UserError,
+)
 if TYPE_CHECKING:
     from openai import OpenAI
@@ -30,12 +38,45 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
 class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: OctoAiEmbeddingConfig
+    def wrap_error(self, e: Exception) -> Exception:
+        # https://platform.openai.com/docs/guides/error-codes/api-errors
+        from openai import APIStatusError
+        if not isinstance(e, APIStatusError):
+            logger.error(f"unhandled exception from openai: {e}", exc_info=True)
+            raise e
+        error_code = e.code
+        if 400 <= e.status_code < 500:
+            # user error
+            if e.status_code == 401:
+                return UserAuthError(e.message)
+            if e.status_code == 429:
+                # 429 indicates rate limit exceeded and quote exceeded
+                if error_code == "insufficient_quota":
+                    return QuotaError(e.message)
+                else:
+                    return RateLimitError(e.message)
+            return UserError(e.message)
+        if e.status_code >= 500:
+            return ProviderError(e.message)
+        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
+        return e
     def embed_query(self, query: str):
-        client = self.config.get_client()
-        response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+        try:
+            client = self.config.get_client()
+            response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+        except Exception as e:
+            raise self.wrap_error(e=e)
         return response.data[0].embedding
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = [self.embed_query(e.get("text", "")) for e in elements]
+        texts = [e.get("text", "") for e in elements]
+        try:
+            client = self.config.get_client()
+            response = client.embeddings.create(input=texts, model=self.config.embedder_model_name)
+        except Exception as e:
+            raise self.wrap_error(e=e)
+        embeddings = [data.embedding for data in response.data]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings

unstructured_ingest/embed/openai.py CHANGED Viewed

@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.errors import (
+    ProviderError,
+    QuotaError,
+    RateLimitError,
+    UserAuthError,
+    UserError,
+)
 if TYPE_CHECKING:
     from openai import OpenAI
@@ -25,9 +33,37 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
 class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: OpenAIEmbeddingConfig
+    def wrap_error(self, e: Exception) -> Exception:
+        # https://platform.openai.com/docs/guides/error-codes/api-errors
+        from openai import APIStatusError
+        if not isinstance(e, APIStatusError):
+            logger.error(f"unhandled exception from openai: {e}", exc_info=True)
+            raise e
+        error_code = e.code
+        if 400 <= e.status_code < 500:
+            # user error
+            if e.status_code == 401:
+                return UserAuthError(e.message)
+            if e.status_code == 429:
+                # 429 indicates rate limit exceeded and quote exceeded
+                if error_code == "insufficient_quota":
+                    return QuotaError(e.message)
+                else:
+                    return RateLimitError(e.message)
+            return UserError(e.message)
+        if e.status_code >= 500:
+            return ProviderError(e.message)
+        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
+        return e
     def embed_query(self, query: str) -> list[float]:
         client = self.config.get_client()
-        response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+        try:
+            response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+        except Exception as e:
+            raise self.wrap_error(e=e)
         return response.data[0].embedding
     def embed_documents(self, elements: list[dict]) -> list[dict]:

unstructured_ingest/embed/togetherai.py CHANGED Viewed

@@ -4,7 +4,15 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.errors import (
+    RateLimitError as CustomRateLimitError,
+)
+from unstructured_ingest.v2.errors import (
+    UserAuthError,
+    UserError,
+)
 if TYPE_CHECKING:
     from together import Together
@@ -27,6 +35,20 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
 class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: TogetherAIEmbeddingConfig
+    def wrap_error(self, e: Exception) -> Exception:
+        # https://docs.together.ai/docs/error-codes
+        from together.error import AuthenticationError, RateLimitError, TogetherException
+        if not isinstance(e, TogetherException):
+            logger.error(f"unhandled exception from openai: {e}", exc_info=True)
+            return e
+        message = e.args[0]
+        if isinstance(e, AuthenticationError):
+            return UserAuthError(message)
+        if isinstance(e, RateLimitError):
+            return CustomRateLimitError(message)
+        return UserError(message)
     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(elements=[query])[0]
@@ -36,5 +58,10 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_client()
-        outputs = client.embeddings.create(model=self.config.embedder_model_name, input=elements)
+        try:
+            outputs = client.embeddings.create(
+                model=self.config.embedder_model_name, input=elements
+            )
+        except Exception as e:
+            raise self.wrap_error(e=e)
         return [outputs.data[i].embedding for i in range(len(elements))]

unstructured_ingest/embed/voyageai.py CHANGED Viewed

@@ -4,7 +4,16 @@ from typing import TYPE_CHECKING, Optional
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.errors import (
+    ProviderError,
+    UserAuthError,
+    UserError,
+)
+from unstructured_ingest.v2.errors import (
+    RateLimitError as CustomRateLimitError,
+)
 if TYPE_CHECKING:
     from voyageai import Client as VoyageAIClient
@@ -38,9 +47,32 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig
+    def wrap_error(self, e: Exception) -> Exception:
+        # https://docs.voyageai.com/docs/error-codes
+        from voyageai.error import AuthenticationError, RateLimitError, VoyageError
+        if not isinstance(e, VoyageError):
+            logger.error(f"unhandled exception from openai: {e}", exc_info=True)
+            raise e
+        http_code = e.http_status
+        message = e.user_message
+        if isinstance(e, AuthenticationError):
+            return UserAuthError(message)
+        if isinstance(e, RateLimitError):
+            return CustomRateLimitError(message)
+        if 400 <= http_code < 500:
+            return UserError(message)
+        if http_code >= 500:
+            return ProviderError(message)
+        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
+        return e
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client: VoyageAIClient = self.config.get_client()
-        response = client.embed(texts=elements, model=self.config.embedder_model_name)
+        try:
+            response = client.embed(texts=elements, model=self.config.embedder_model_name)
+        except Exception as e:
+            self.wrap_error(e=e)
         return response.embeddings
     def embed_documents(self, elements: list[dict]) -> list[dict]:

unstructured_ingest/v2/errors.py ADDED Viewed

@@ -0,0 +1,18 @@
+class UserError(Exception):
+    pass
+class UserAuthError(UserError):
+    pass
+class RateLimitError(UserError):
+    pass
+class QuotaError(UserError):
+    pass
+class ProviderError(Exception):
+    pass

unstructured_ingest/v2/interfaces/file_data.py CHANGED Viewed

@@ -36,7 +36,7 @@ class FileDataSourceMetadata(BaseModel):
 class FileData(BaseModel):
     identifier: str
     connector_type: str
-    source_identifiers: Optional[SourceIdentifiers] = None
+    source_identifiers: SourceIdentifiers
     metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
     additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
@@ -73,6 +73,7 @@ class BatchItem(BaseModel):
 class BatchFileData(FileData):
     identifier: str = Field(init=False)
     batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
     @field_validator("batch_items")
     @classmethod
@@ -104,3 +105,12 @@ def file_data_from_file(path: str) -> FileData:
         logger.debug(f"{path} not valid for batch file data")
     return FileData.from_file(path=path)
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+    return FileData.model_validate(data)

unstructured_ingest/v2/processes/connectors/__init__.py CHANGED Viewed

@@ -48,12 +48,16 @@ from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
 from .outlook import outlook_source_entry
 from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
 from .pinecone import pinecone_destination_entry
+from .redisdb import CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE
+from .redisdb import redis_destination_entry
 from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
 from .salesforce import salesforce_source_entry
 from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
 from .slack import slack_source_entry
+from .vectara import CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE
+from .vectara import vectara_destination_entry
 add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
 add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -101,4 +105,7 @@ add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
 add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
+add_destination_entry(destination_type=VECTARA_CONNECTOR_TYPE, entry=vectara_destination_entry)
 add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
+add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)

unstructured_ingest/v2/processes/connectors/astradb.py CHANGED Viewed

@@ -30,6 +30,7 @@ from unstructured_ingest.v2.interfaces import (
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -267,6 +268,7 @@ class AstraDBDownloader(Downloader):
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
         # modify input file_data for download_response
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
         cast_file_data = FileData.cast(file_data=file_data)
         cast_file_data.identifier = filename
         cast_file_data.metadata.date_processed = str(time())

unstructured_ingest/v2/processes/connectors/chroma.py CHANGED Viewed

@@ -138,7 +138,6 @@ class ChromaUploader(Uploader):
     @DestinationConnectionError.wrap
     def upsert_batch(self, collection, batch):
         try:
             # Chroma wants lists even if there is only one element
             # Upserting to prevent duplicates

unstructured_ingest/v2/processes/connectors/couchbase.py CHANGED Viewed

@@ -27,6 +27,7 @@ from unstructured_ingest.v2.interfaces import (
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -261,6 +262,7 @@ class CouchbaseDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
         cast_file_data = FileData.cast(file_data=file_data)
         cast_file_data.identifier = filename_id
         cast_file_data.metadata.date_processed = str(time.time())

unstructured_ingest/v2/processes/connectors/databricks/volumes.py CHANGED Viewed

@@ -14,6 +14,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -52,6 +53,10 @@ class DatabricksPathMixin(BaseModel):
         return path
+class DatabricksVolumesAccessConfig(AccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
 class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
     host: Optional[str] = Field(
         default=None,

unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py CHANGED Viewed

@@ -3,12 +3,12 @@ from typing import Optional
 from pydantic import Field, Secret
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes_aws"
-class DatabricksAWSVolumesAccessConfig(AccessConfig):
+class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
         description="The Databricks account ID for the Databricks " "accounts endpoint",

unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py CHANGED Viewed

@@ -3,12 +3,12 @@ from typing import Optional
 from pydantic import Field, Secret
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes_azure"
-class DatabricksAzureVolumesAccessConfig(AccessConfig):
+class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
         description="The Databricks account ID for the Databricks " "accounts endpoint.",

unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl