unstructured-ingest 0.5.14__py3-none-any.whl → 0.5.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- test/integration/connectors/test_confluence.py +2 -2
- test/integration/connectors/test_zendesk.py +31 -53
- test/integration/connectors/utils/validation/source.py +5 -3
- test/unit/v2/connectors/test_confluence.py +35 -3
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/huggingface.py +3 -7
- unstructured_ingest/utils/data_prep.py +4 -2
- unstructured_ingest/v2/interfaces/file_data.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -6
- unstructured_ingest/v2/pipeline/pipeline.py +7 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +3 -3
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
- unstructured_ingest/v2/processes/connectors/confluence.py +20 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +6 -0
- unstructured_ingest/v2/processes/connectors/local.py +8 -1
- unstructured_ingest/v2/processes/connectors/zendesk/client.py +221 -156
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +83 -274
- unstructured_ingest/v2/processes/embedder.py +3 -4
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/blob_storage.py +31 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/METADATA +20 -20
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/RECORD +39 -37
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_confluence.py

@@ -30,7 +30,7 @@ async def test_confluence_source(temp_dir):
     spaces = ["testteamsp", "MFS"]

     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(
+    access_config = ConfluenceAccessConfig(api_token=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
         username=user_email,
@@ -77,7 +77,7 @@ async def test_confluence_source_large(temp_dir):
     spaces = ["testteamsp1"]

     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(
+    access_config = ConfluenceAccessConfig(api_token=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
         username=user_email,
test/integration/connectors/test_zendesk.py

@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from typing import Optional

 import pytest

@@ -21,20 +20,20 @@ from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
     ZendeskIndexerConfig,
 )

+SUBDOMAIN = "unstructuredhelp"
+EMAIL = "test@unstructured.io"

-async def zendesk_source_test(
-    tmp_path: Path,
-    token: Optional[str] = None,
-    email: Optional[str] = None,
-    subdomain: Optional[str] = None,
-):

-
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_tickets(temp_dir: Path):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )

-    index_config = ZendeskIndexerConfig(
+    index_config = ZendeskIndexerConfig(item_type="tickets")

     indexer = ZendeskIndexer(
         connection_config=connection_config,
@@ -43,7 +42,7 @@ async def zendesk_source_test(
     )

     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir)

     downloader = ZendeskDownloader(
         connection_config=connection_config,
@@ -57,26 +56,23 @@ async def zendesk_source_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-tickets",
-            expected_num_files=
+            expected_num_files=8,
             validate_file_data=False,
             validate_downloaded_files=True,
         ),
     )


-
-
-
-
-
-):
-
-    access_config = ZendeskAccessConfig(api_token=token)
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_articles(temp_dir):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )

-    index_config = ZendeskIndexerConfig(
+    index_config = ZendeskIndexerConfig(item_type="articles")

     indexer = ZendeskIndexer(
         connection_config=connection_config,
@@ -85,7 +81,7 @@ async def zendesk_source_articles_test(
     )

     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir, extract_images=True)

     downloader = ZendeskDownloader(
         connection_config=connection_config,
@@ -99,44 +95,26 @@ async def zendesk_source_articles_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-articles",
-            expected_num_files=
-            validate_file_data=
+            expected_num_files=8,
+            validate_file_data=True,
             validate_downloaded_files=True,
         ),
     )


-@pytest.mark.asyncio
 @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-
-
-
-
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+def test_zendesk_source_articles_fail(temp_dir):
+    access_config = ZendeskAccessConfig(api_token="FAKE_TOKEN")
+    connection_config = ZendeskConnectionConfig(
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )

+    index_config = ZendeskIndexerConfig(item_type="tickets")

-
-
-
-
-    await zendesk_source_articles_test(
-        tmp_path=temp_dir,
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+    indexer = ZendeskIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+        connector_type=CONNECTOR_TYPE,
     )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-async def test_zendesk_source_articles_fail(temp_dir):
     with pytest.raises(expected_exception=UserAuthError):
-
-            tmp_path=temp_dir,
-            token="FORCE_FAIL_TOKEN",
-            email="test@unstructured.io",
-            subdomain="unstructuredhelp",
-        )
+        indexer.precheck()
test/integration/connectors/utils/validation/source.py

@@ -103,7 +103,7 @@ def check_contents(
    file_data_path = expected_output_dir / f"{file_data.identifier}.json"
    with file_data_path.open("r") as file:
        expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.
+    current_file_data_contents = json.loads(file_data.model_dump_json())
    expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
    current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
    diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -184,7 +184,7 @@ def update_fixtures(
    for file_data in all_file_data:
        file_data_path = file_data_output_path / f"{file_data.identifier}.json"
        with file_data_path.open(mode="w") as f:
-
+            f.write(file_data.model_dump_json(indent=2))

    # Record file structure of download directory
    download_files = get_files(dir_path=download_dir)
@@ -216,7 +216,9 @@ def run_all_validations(
        len(predownload_file_data) == expected_number_indexed_file_data
    ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
    if expected_num_files := configs.expected_num_files:
-        assert
+        assert (
+            len(postdownload_file_data) == expected_num_files
+        ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"

    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
        configs.run_file_data_validation(
test/unit/v2/connectors/test_confluence.py

@@ -11,7 +11,7 @@ def test_connection_config_multiple_auth():
     with pytest.raises(ValidationError):
         ConfluenceConnectionConfig(
             access_config=ConfluenceAccessConfig(
-                password="
+                password="password",
                 token="access_token",
             ),
             username="user_email",
@@ -19,14 +19,46 @@ def test_connection_config_multiple_auth():
         )


+def test_connection_config_multiple_auth2():
+    with pytest.raises(ValidationError):
+        ConfluenceConnectionConfig(
+            access_config=ConfluenceAccessConfig(
+                api_token="api_token",
+                token="access_token",
+            ),
+            username="user_email",
+            url="url",
+        )
+
+
+def test_connection_config_multiple_auth3():
+    with pytest.raises(ValidationError):
+        ConfluenceConnectionConfig(
+            access_config=ConfluenceAccessConfig(
+                api_token="api_token",
+                password="password",
+            ),
+            username="user_email",
+            url="url",
+        )
+
+
 def test_connection_config_no_auth():
     with pytest.raises(ValidationError):
         ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")


-def
+def test_connection_config_password_auth():
+    ConfluenceConnectionConfig(
+        access_config=ConfluenceAccessConfig(password="password"),
+        url="url",
+        username="user_email",
+    )
+
+
+def test_connection_config_api_token_auth():
     ConfluenceConnectionConfig(
-        access_config=ConfluenceAccessConfig(
+        access_config=ConfluenceAccessConfig(api_token="api_token"),
         url="url",
         username="user_email",
     )
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.5.14"  # pragma: no cover
+__version__ = "0.5.16"  # pragma: no cover
unstructured_ingest/embed/huggingface.py

@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Optional

 from pydantic import Field
@@ -15,14 +15,11 @@ if TYPE_CHECKING:


 class HuggingFaceEmbeddingConfig(EmbeddingConfig):
-    embedder_model_name: Optional[str] = Field(
-        default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
-    )
+    embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name")
     embedder_model_kwargs: Optional[dict] = Field(
         default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
     )
     encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
-    cache_folder: Optional[str] = Field(default=None)

     @requires_dependencies(
         ["sentence_transformers"],
@@ -33,7 +30,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):

         return SentenceTransformer(
             model_name_or_path=self.embedder_model_name,
-            cache_folder=self.cache_folder,
             **self.embedder_model_kwargs,
         )

@@ -45,7 +41,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):

 @dataclass
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
-    config: HuggingFaceEmbeddingConfig
+    config: HuggingFaceEmbeddingConfig = field(default_factory=HuggingFaceEmbeddingConfig)

     def _embed_query(self, query: str) -> list[float]:
         return self._embed_documents(texts=[query])[0]
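Because `config` now has a `default_factory`, a `HuggingFaceEmbeddingEncoder` can be constructed without arguments and falls back to the `all-MiniLM-L6-v2` model on CPU. A minimal usage sketch (not taken from the package docs; it only uses names visible in the hunks above):

```python
from unstructured_ingest.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)

# config now defaults to HuggingFaceEmbeddingConfig(), so no arguments are needed
encoder = HuggingFaceEmbeddingEncoder()

# equivalent explicit form; "model_name" is the field alias shown in the diff
encoder = HuggingFaceEmbeddingEncoder(
    config=HuggingFaceEmbeddingConfig(model_name="all-MiniLM-L6-v2")
)
```

Note that the `cache_folder` option was removed from the config and the default model name no longer carries the `sentence-transformers/` prefix, so callers that relied on either need to adjust.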
unstructured_ingest/utils/data_prep.py

@@ -2,7 +2,7 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast

 import pandas as pd

@@ -163,7 +163,9 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
         raise IOError("Unsupported file type: {path}")


-def get_data(path: Path) -> list[dict]:
+def get_data(path: Union[Path, str]) -> list[dict]:
+    if isinstance(path, str):
+        path = Path(path)
     try:
         return get_data_by_suffix(path=path)
     except Exception as e:
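`get_data` now accepts either a `Path` or a plain string and normalizes to `Path` before dispatching on the file suffix. A small usage sketch (the file name is illustrative):

```python
from pathlib import Path

from unstructured_ingest.utils.data_prep import get_data

# both call forms now behave the same
elements = get_data("output/elements.ndjson")
elements = get_data(Path("output/elements.ndjson"))
```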
unstructured_ingest/v2/interfaces/file_data.py

@@ -102,7 +102,7 @@ def file_data_from_file(path: str) -> FileData:
     try:
         return BatchFileData.from_file(path=path)
     except ValidationError:
-        logger.debug(f"{path} not
+        logger.debug(f"{path} not detected as batch file data")

     return FileData.from_file(path=path)

unstructured_ingest/v2/interfaces/upload_stager.py

@@ -1,4 +1,3 @@
-import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
@@ -7,6 +6,7 @@ from typing import Any, TypeVar
 from pydantic import BaseModel

 from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess

@@ -43,16 +43,13 @@ class UploadStager(BaseProcess, ABC):
             writer.f.flush()

     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-
-            elements_contents = json.load(in_f)
+        elements_contents = get_data(path=input_file)

         conformed_elements = [
             self.conform_dict(element_dict=element, file_data=file_data)
             for element in elements_contents
         ]
-
-        with open(output_file, "w") as out_f:
-            json.dump(conformed_elements, out_f, indent=2)
+        write_data(path=output_file, data=conformed_elements)

     def run(
         self,
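With `process_whole` routed through `get_data` and `write_data`, a concrete stager only has to supply `conform_dict`; reading and writing (JSON or NDJSON, chosen by the file suffix) is handled by the base class. A hedged sketch of a custom stager built on that hook; the `AddSourceIdStager` class is hypothetical, not part of the package:

```python
from dataclasses import dataclass
from typing import Any

from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager


@dataclass
class AddSourceIdStager(UploadStager):  # hypothetical example, not shipped with the package
    def conform_dict(self, element_dict: dict[str, Any], file_data: FileData) -> dict[str, Any]:
        # attach the originating record's identifier to every element before upload
        element_dict["source_identifier"] = file_data.identifier
        return element_dict
```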
unstructured_ingest/v2/pipeline/pipeline.py

@@ -108,6 +108,13 @@ class Pipeline:
         uploader_connector_type = self.uploader_step.process.connector_type
         registry_entry = destination_registry[uploader_connector_type]
         if registry_entry.upload_stager and self.stager_step is None:
+            try:
+                self.stager_step = UploadStageStep(
+                    process=registry_entry.upload_stager(), context=self.context
+                )
+                return
+            except Exception as e:
+                logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
             raise ValueError(
                 f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
                 f"expects a stager of type {registry_entry.upload_stager.__name__} "
unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -38,7 +38,7 @@ class ChunkStep(PipelineStep):
         return not filepath.exists()

     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/pipeline/steps/download.py

@@ -88,9 +88,9 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"updating file data with new content: {file_data.
+            logger.debug(f"updating file data with new content: {file_data.model_dump_json()}")
             with file_data_path.open("w") as file:
-
+                file.write(file_data.model_dump_json(indent=2))

     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = file_data_from_file(path=file_data_path)
@@ -173,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-
+            f.write(file_data.model_dump_json(indent=2))
         return str(filepath)

     def get_hash(self, extras: Optional[list[str]]) -> str:
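The index and download steps now persist `FileData` with pydantic's `model_dump_json` and read it back through `file_data_from_file` (see the `file_data.py` hunk above), which first tries `BatchFileData` and then falls back to plain `FileData`. A round-trip sketch, assuming `file_data` is an existing `FileData` instance and the cache path is illustrative:

```python
from pathlib import Path

from unstructured_ingest.v2.interfaces.file_data import file_data_from_file

record_path = Path("cache/index/1234abcd.json")  # illustrative cache location
record_path.parent.mkdir(parents=True, exist_ok=True)
record_path.write_text(file_data.model_dump_json(indent=2))  # file_data: an existing FileData

restored = file_data_from_file(path=str(record_path))
```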
unstructured_ingest/v2/pipeline/steps/embed.py

@@ -38,7 +38,7 @@ class EmbedStep(PipelineStep):
         return not filepath.exists()

     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/pipeline/steps/index.py

@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):

     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
unstructured_ingest/v2/pipeline/steps/partition.py

@@ -38,7 +38,7 @@ class PartitionStep(PipelineStep):
         return not filepath.exists()

     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/processes/connectors/confluence.py

@@ -35,7 +35,11 @@ CONNECTOR_TYPE = "confluence"

 class ConfluenceAccessConfig(AccessConfig):
     password: Optional[str] = Field(
-        description="Confluence password
+        description="Confluence password",
+        default=None,
+    )
+    api_token: Optional[str] = Field(
+        description="Confluence Cloud API token",
         default=None,
     )
     token: Optional[str] = Field(
@@ -57,7 +61,12 @@ class ConfluenceConnectionConfig(ConnectionConfig):

     def model_post_init(self, __context):
         access_configs = self.access_config.get_secret_value()
-
+        if access_configs.password and access_configs.api_token:
+            raise ValueError(
+                "both password and api_token provided, only one allowed, "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
+        basic_auth = bool(self.username and (access_configs.password or access_configs.api_token))
         pat_auth = access_configs.token
         if self.cloud and not basic_auth:
             raise ValueError(
@@ -74,6 +83,14 @@ class ConfluenceConnectionConfig(ConnectionConfig):
                 "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
             )

+    def password_or_api_token(self) -> str:
+        # Confluence takes either password or API token under the same field: password
+        # This ambiguity led to confusion, so we are making it specific what you are passing in
+        access_configs = self.access_config.get_secret_value()
+        if access_configs.password:
+            return access_configs.password
+        return access_configs.api_token
+
     @requires_dependencies(["atlassian"], extras="confluence")
     @contextmanager
     def get_client(self) -> "Confluence":
@@ -83,7 +100,7 @@ class ConfluenceConnectionConfig(ConnectionConfig):
         with Confluence(
             url=self.url,
             username=self.username,
-            password=
+            password=self.password_or_api_token(),
             token=access_configs.token,
             cloud=self.cloud,
         ) as client:
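The access config now distinguishes a Confluence Cloud API token from a server password, rejects supplying both, and `get_client` forwards whichever one is set via `password_or_api_token()`. A minimal configuration sketch, assuming Confluence Cloud basic auth (URL, user, and token values are placeholders):

```python
from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
)

connection_config = ConfluenceConnectionConfig(
    url="https://example.atlassian.net",
    username="user@example.com",
    access_config=ConfluenceAccessConfig(api_token="YOUR_API_TOKEN"),
)

# Passing both credentials now fails validation in model_post_init:
# ConfluenceAccessConfig(password="...", api_token="...") -> ValueError
```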
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 CONNECTOR_TYPE = "databricks_volumes_aws"

@@ -76,6 +80,8 @@ databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksAWSVolumesConnectionConfig,
     uploader=DatabricksAWSVolumesUploader,
     uploader_config=DatabricksAWSVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )

 databricks_aws_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 CONNECTOR_TYPE = "databricks_volumes_azure"

@@ -91,6 +95,8 @@ databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksAzureVolumesConnectionConfig,
     uploader=DatabricksAzureVolumesUploader,
     uploader_config=DatabricksAzureVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )

 databricks_azure_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 CONNECTOR_TYPE = "databricks_volumes_gcp"

@@ -74,6 +78,8 @@ databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksGoogleVolumesConnectionConfig,
     uploader=DatabricksGoogleVolumesUploader,
     uploader_config=DatabricksGoogleVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )

 databricks_gcp_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 CONNECTOR_TYPE = "databricks_volumes"

@@ -75,6 +79,8 @@ databricks_native_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksNativeVolumesConnectionConfig,
     uploader=DatabricksNativeVolumesUploader,
     uploader_config=DatabricksNativeVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )

 databricks_native_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py

@@ -61,7 +61,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                         self.upload_config.database, ", ".join(databases)
                     )
                 )
-            cursor.execute("SHOW TABLES")
+            cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
             table_names = [r[1] for r in cursor.fetchall()]
             if self.upload_config.table_name not in table_names:
                 raise ValueError(
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 if TYPE_CHECKING:
     from adlfs import AzureBlobFileSystem
@@ -194,4 +198,6 @@ azure_destination_entry = DestinationRegistryEntry(
     uploader=AzureUploader,
     uploader_config=AzureUploaderConfig,
     connection_config=AzureConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 if TYPE_CHECKING:
     from boxfs import BoxFileSystem
@@ -167,4 +171,6 @@ box_destination_entry = DestinationRegistryEntry(
     uploader=BoxUploader,
     uploader_config=BoxUploaderConfig,
     connection_config=BoxConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -31,6 +31,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploader,
     FsspecUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 if TYPE_CHECKING:
     pass
@@ -228,4 +232,6 @@ dropbox_destination_entry = DestinationRegistryEntry(
     uploader=DropboxUploader,
     uploader_config=DropboxUploaderConfig,
     connection_config=DropboxConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
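All of the blob-storage style destinations above now register `BlobStoreUploadStager` as their default stager, which pairs with the `pipeline.py` change: when a destination declares a stager and none was configured, the pipeline first tries to instantiate the registry's stager on the user's behalf and only raises if that fails. A hedged sketch of that lookup; the registry import path and the `"s3"` key are assumptions, not confirmed by this diff:

```python
# Hypothetical illustration of the new fallback, mirroring the pipeline.py hunk above.
from unstructured_ingest.v2.processes.connector_registry import destination_registry  # assumed path

registry_entry = destination_registry["s3"]  # assumed connector key for the fsspec S3 destination
if registry_entry.upload_stager is not None:
    # the pipeline now attempts this on the user's behalf instead of failing immediately
    stager = registry_entry.upload_stager()
```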