unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_kafka.py +116 -16
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_s3.py +23 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -4
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +78 -23
- unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +20 -19
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +58 -37
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/__init__.py
CHANGED

@@ -14,8 +14,8 @@ from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry, astra_db_source_entry
-from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNITIVE_SEARCH_CONNECTOR_TYPE
-from .azure_cognitive_search import azure_cognitive_search_destination_entry
+from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
+from .azure_ai_search import azure_ai_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE

@@ -97,8 +97,8 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)

 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
-    destination_type=AZURE_COGNITIVE_SEARCH_CONNECTOR_TYPE,
-    entry=azure_cognitive_search_destination_entry,
+    destination_type=AZURE_AI_SEARCH_CONNECTOR_TYPE,
+    entry=azure_ai_search_destination_entry,
 )

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
unstructured_ingest/v2/processes/connectors/airtable.py
CHANGED

@@ -12,11 +12,11 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,

@@ -214,7 +214,7 @@ class AirtableDownloader(Downloader):
             row_dict.update(table_row["fields"])
         return row_dict

-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
         df = pandas.DataFrame.from_dict(
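Across the source connectors touched in this release (airtable here, confluence and google_drive below), the return annotation of Downloader.run switches from the removed download_responses alias to the DownloadResponse type imported from unstructured_ingest.v2.interfaces. A minimal sketch of the new shape, assuming a connector whose fetch logic is stubbed out (fetch_bytes is a hypothetical placeholder, not part of the library; get_download_path and generate_download_response are the base-class helpers used in the hunks below):

from typing import Any

from unstructured_ingest.v2.interfaces import Downloader, DownloadResponse, FileData


class ExampleDownloader(Downloader):
    def fetch_bytes(self, file_data: FileData) -> bytes:
        # hypothetical: stands in for the connector-specific fetch call
        raise NotImplementedError

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        download_path.write_bytes(self.fetch_bytes(file_data))
        # generate_download_response pairs the written path with its FileData
        return self.generate_download_response(file_data=file_data, download_path=download_path)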
unstructured_ingest/v2/processes/connectors/astradb.py
CHANGED

@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -129,11 +130,6 @@ class AstraDBIndexerConfig(IndexerConfig):
         "numbers, and underscores."
     )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     batch_size: int = Field(default=20, description="Number of records per batch")


@@ -147,21 +143,17 @@ class AstraDBUploaderConfig(UploaderConfig):
         "Note that the collection name must only include letters, "
         "numbers, and underscores."
     )
-    embedding_dimension: int = Field(
-        default=384, description="The dimensionality of the embeddings"
-    )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
         examples=['{"deny": ["metadata"]}'],
     )
     batch_size: int = Field(default=20, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )


 @dataclass

@@ -173,7 +165,7 @@ class AstraDBIndexer(Indexer):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.index_config.collection_name,
-            keyspace=self.index_config.keyspace
+            keyspace=self.index_config.keyspace,
         )

     def precheck(self) -> None:

@@ -223,7 +215,7 @@ class AstraDBIndexer(Indexer):
                 additional_metadata={
                     "ids": list(batch),
                     "collection_name": self.index_config.collection_name,
-                    "keyspace": self.index_config.keyspace
+                    "keyspace": self.index_config.keyspace,
                 },
             )
             yield fd

@@ -309,10 +301,11 @@ class AstraDBUploadStager(UploadStager):
         default_factory=lambda: AstraDBUploadStagerConfig()
     )

-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),
+            RECORD_ID_LABEL: file_data.identifier,
             "metadata": element_dict,
         }

@@ -328,10 +321,15 @@ class AstraDBUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
         conformed_elements = []
         for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element))
-
+            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
+        output_filename_path = Path(output_filename)
+        if output_filename_path.suffix == ".json":
+            output_path = Path(output_dir) / output_filename_path
+        else:
+            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path


@@ -346,7 +344,7 @@ class AstraDBUploader(Uploader):
             get_astra_collection(
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
-                keyspace=self.upload_config.keyspace
+                keyspace=self.upload_config.keyspace,
             )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)

@@ -357,7 +355,19 @@ class AstraDBUploader(Uploader):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace
+            keyspace=self.upload_config.keyspace,
+        )
+
+    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
+        logger.debug(
+            f"deleting records from collection {collection.name} "
+            f"with {self.upload_config.record_id_key} "
+            f"set to {file_data.identifier}"
+        )
+        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        delete_resp = collection.delete_many(filter=delete_filter)
+        logger.debug(
+            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

@@ -371,6 +381,8 @@ class AstraDBUploader(Uploader):
         astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()

+        self.delete_by_record_id(collection=collection, file_data=file_data)
+
         for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)
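The AstraDB stager now stamps every staged document with RECORD_ID_LABEL set to file_data.identifier, and the uploader calls delete_by_record_id before insert_many, so re-ingesting the same source file replaces its previously written rows instead of appending duplicates. A minimal sketch of that replace-on-rerun pattern against an Astra DB collection handle (the helper name and the literal label value are assumptions; the delete filter and batching mirror the diff above):

# Sketch only: `collection` is an astrapy collection handle, and the literal value of
# RECORD_ID_LABEL is assumed from the "record_id eq ..." filter used by the Azure
# connector below.
RECORD_ID_LABEL = "record_id"


def replace_record(collection, record_id: str, docs: list[dict], batch_size: int = 20) -> None:
    # Drop whatever this record produced on a previous run...
    collection.delete_many(filter={RECORD_ID_LABEL: {"$eq": record_id}})
    # ...then insert the freshly staged documents (each already stamped with the label).
    for start in range(0, len(docs), batch_size):
        collection.insert_many(docs[start : start + batch_size])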
unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py}
RENAMED

@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -26,18 +27,18 @@ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime

 if TYPE_CHECKING:
     from azure.search.documents import SearchClient
+    from azure.search.documents.indexes import SearchIndexClient

+CONNECTOR_TYPE = "azure_ai_search"

-CONNECTOR_TYPE = "azure_cognitive_search"

-
-class AzureCognitiveSearchAccessConfig(AccessConfig):
-    azure_cognitive_search_key: str = Field(
+class AzureAISearchAccessConfig(AccessConfig):
+    azure_ai_search_key: str = Field(
         alias="key", description="Credential that is used for authenticating to an Azure service"
     )


-class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
+class AzureAISearchConnectionConfig(ConnectionConfig):
     endpoint: str = Field(
         description="The URL endpoint of an Azure AI (Cognitive) search service. "
         "In the form of https://{{service_name}}.search.windows.net"

@@ -45,10 +46,10 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
     index: str = Field(
         description="The name of the Azure AI (Cognitive) Search index to connect to."
     )
-    access_config: Secret[AzureCognitiveSearchAccessConfig]
+    access_config: Secret[AzureAISearchAccessConfig]

-    @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
-    def
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_client(self) -> "SearchClient":
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient

@@ -56,27 +57,43 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
-                self.access_config.get_secret_value().azure_cognitive_search_key
+                self.access_config.get_secret_value().azure_ai_search_key
+            ),
+        )
+
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_index_client(self) -> "SearchIndexClient":
+        from azure.core.credentials import AzureKeyCredential
+        from azure.search.documents.indexes import SearchIndexClient
+
+        return SearchIndexClient(
+            endpoint=self.endpoint,
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_ai_search_key
             ),
         )


-class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
+class AzureAISearchUploadStagerConfig(UploadStagerConfig):
     pass


-class AzureCognitiveSearchUploaderConfig(UploaderConfig):
+class AzureAISearchUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )


 @dataclass
-class AzureCognitiveSearchUploadStager(UploadStager):
-    upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field(
-        default_factory=lambda: AzureCognitiveSearchUploadStagerConfig()
+class AzureAISearchUploadStager(UploadStager):
+    upload_stager_config: AzureAISearchUploadStagerConfig = field(
+        default_factory=lambda: AzureAISearchUploadStagerConfig()
     )

     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
         """
         updates the dictionary that is from each Element being converted into a dict/json
         into a dictionary that conforms to the schema expected by the

@@ -84,6 +101,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         """

         data["id"] = str(uuid.uuid4())
+        data[RECORD_ID_LABEL] = file_data.identifier

         if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
             data["metadata"]["coordinates"]["points"] = json.dumps(points)

@@ -124,6 +142,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):

     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,

@@ -132,23 +151,59 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)

-        conformed_elements = [
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]

         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path


 @dataclass
-class AzureCognitiveSearchUploader(Uploader):
-    upload_config: AzureCognitiveSearchUploaderConfig
-    connection_config: AzureCognitiveSearchConnectionConfig
+class AzureAISearchUploader(Uploader):
+    upload_config: AzureAISearchUploaderConfig
+    connection_config: AzureAISearchConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    def query_docs(self, record_id: str, index_key: str) -> list[str]:
+        client = self.connection_config.get_search_client()
+        results = list(client.search(filter=f"record_id eq '{record_id}'", select=[index_key]))
+        return [result[index_key] for result in results]
+
+    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from azure cognitive search index: {self.connection_config.index}"
+        )
+        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
+        if not doc_ids_to_delete:
+            return
+        client: SearchClient = self.connection_config.get_search_client()
+        results = client.delete_documents(
+            documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+        )
+        errors = []
+        success = []
+        for result in results:
+            if result.succeeded:
+                success.append(result)
+            else:
+                errors.append(result)
+        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
+        if errors:
+            raise WriteError(
+                ", ".join(
+                    [f"[{error.status_code}] {error.error_message}" for error in errors],
+                ),
+            )
+
     @DestinationConnectionError.wrap
-    @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self,
+    @requires_dependencies(["azure"], extras="azure-ai-search")
+    def write_dict(self, elements_dict: list[dict[str, Any]]) -> None:
         import azure.core.exceptions

         logger.info(

@@ -156,7 +211,7 @@ class AzureCognitiveSearchUploader(Uploader):
             f"index at {self.connection_config.index}",
         )
         try:
-            results = self.connection_config.
+            results = self.connection_config.get_search_client().upload_documents(
                 documents=elements_dict
             )

@@ -174,24 +229,42 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.azure_cognitive_search_key}: "
+                        f"{error.azure_ai_search_key}: "
                         f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
             )

+    def can_delete(self) -> bool:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        record_id_fields = [
+            field for field in index_fields if field.name == self.upload_config.record_id_key
+        ]
+        if not record_id_fields:
+            return False
+        record_id_field = record_id_fields[0]
+        return record_id_field.filterable
+
+    def get_index_key(self) -> str:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        key_fields = [field for field in index_fields if field.key]
+        if not key_fields:
+            raise ValueError("no key field found in index fields")
+        return key_fields[0].name
+
     def precheck(self) -> None:
         try:
-            client = self.connection_config.
+            client = self.connection_config.get_search_client()
             client.get_document_count()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def write_dict_wrapper(self, elements_dict):
-        return self.write_dict(elements_dict=elements_dict)
-
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)

@@ -201,17 +274,21 @@ class AzureCognitiveSearchUploader(Uploader):
             f" index at {str(self.connection_config.index)}"
             f" with batch size {str(self.upload_config.batch_size)}"
         )
+        if self.can_delete():
+            index_key = self.get_index_key()
+            self.delete_by_record_id(file_data=file_data, index_key=index_key)
+        else:
+            logger.warning("criteria for deleting previous content not met, skipping")

         batch_size = self.upload_config.batch_size
-
         for chunk in batch_generator(elements_dict, batch_size):
             self.write_dict(elements_dict=chunk)  # noqa: E203


-azure_cognitive_search_destination_entry = DestinationRegistryEntry(
-    connection_config=AzureCognitiveSearchConnectionConfig,
-    uploader=AzureCognitiveSearchUploader,
-    uploader_config=AzureCognitiveSearchUploaderConfig,
-    upload_stager=AzureCognitiveSearchUploadStager,
-    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
+azure_ai_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureAISearchConnectionConfig,
+    uploader=AzureAISearchUploader,
+    uploader_config=AzureAISearchUploaderConfig,
+    upload_stager=AzureAISearchUploadStager,
+    upload_stager_config=AzureAISearchUploadStagerConfig,
 )
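Besides the rename from azure_cognitive_search to azure_ai_search, the uploader now clears previously written content for a record before re-uploading it, but only when can_delete() finds that the index has a filterable field matching record_id_key; otherwise it logs a warning and appends. A sketch of configuring the renamed destination with the classes and fields introduced above (the endpoint, index name, and key values are placeholders, and "record_id" is the assumed default label):

# Sketch: configuring the renamed Azure AI Search destination. Class and field names
# come from the diff above; all values are placeholders.
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
    AzureAISearchUploaderConfig,
)

connection_config = AzureAISearchConnectionConfig(
    endpoint="https://<service_name>.search.windows.net",
    index="my-index",
    access_config=AzureAISearchAccessConfig(key="<api-key>"),
)
uploader_config = AzureAISearchUploaderConfig(
    batch_size=100,
    # record_id_key must exist as a filterable field in the index for the
    # delete-by-record-id pass (can_delete) to run before each upload.
    record_id_key="record_id",
)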
unstructured_ingest/v2/processes/connectors/confluence.py
CHANGED

@@ -11,12 +11,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (

@@ -154,7 +154,7 @@ class ConfluenceDownloader(Downloader):
     download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
     connector_type: str = CONNECTOR_TYPE

-    def run(self, file_data: FileData, **kwargs) -> download_responses:
+    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
         doc_id = file_data.identifier
         try:
             client = self.connection_config.get_client()
unstructured_ingest/v2/processes/connectors/couchbase.py
CHANGED

@@ -205,6 +205,7 @@ class CouchbaseIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
unstructured_ingest/v2/processes/connectors/delta_table.py
CHANGED

@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 from multiprocessing import Process
 from pathlib import Path
 from typing import Any, Optional
+from urllib.parse import urlparse

 import pandas as pd
 from pydantic import Field, Secret

@@ -94,7 +95,7 @@ class DeltaTableUploader(Uploader):
     connection_config: DeltaTableConnectionConfig
     connector_type: str = CONNECTOR_TYPE

-    @requires_dependencies(["
+    @requires_dependencies(["boto3"], extras="delta-table")
     def precheck(self):
         secrets = self.connection_config.access_config.get_secret_value()
         if (

@@ -102,13 +103,24 @@ class DeltaTableUploader(Uploader):
             and secrets.aws_access_key_id
             and secrets.aws_secret_access_key
         ):
-            from
+            from boto3 import client
+
+            url = urlparse(self.connection_config.table_uri)
+            bucket_name = url.netloc
+            dir_path = url.path.lstrip("/")

            try:
-
-
+                s3_client = client(
+                    "s3",
+                    aws_access_key_id=secrets.aws_access_key_id,
+                    aws_secret_access_key=secrets.aws_secret_access_key,
                )
-
+                s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")
+
+                response = s3_client.get_bucket_location(Bucket=bucket_name)
+
+                if self.connection_config.aws_region != response.get("LocationConstraint"):
+                    raise ValueError("Wrong AWS Region was provided.")

            except Exception as e:
                logger.error(f"failed to validate connection: {e}", exc_info=True)
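The rewritten precheck validates S3-backed table URIs directly with boto3: it parses connection_config.table_uri, writes an empty object to the target prefix, and compares get_bucket_location against the configured aws_region. The URI split it relies on works like this (bucket and prefix values are illustrative):

# Sketch of how the precheck derives the bucket and key from an S3 table URI.
from urllib.parse import urlparse

url = urlparse("s3://my-bucket/path/to/delta-table")
bucket_name = url.netloc         # "my-bucket"
dir_path = url.path.lstrip("/")  # "path/to/delta-table"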
unstructured_ingest/v2/processes/connectors/elasticsearch.py
CHANGED

@@ -191,6 +191,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
CHANGED

@@ -1,6 +1,9 @@
 from __future__ import annotations

+import os
 import random
+import shutil
+import tempfile
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar

@@ -207,12 +210,35 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )

+    def handle_directory_download(self, lpath: Path) -> None:
+        # If the object's name contains certain characters (i.e. '?'), it
+        # gets downloaded into a new directory of the same name. This
+        # reconciles that with what is expected, which is to download it
+        # as a file that is not within a directory.
+        if not lpath.is_dir():
+            return
+        desired_name = lpath.name
+        files_in_dir = [file for file in lpath.iterdir() if file.is_file()]
+        if not files_in_dir:
+            raise ValueError(f"no files in {lpath}")
+        if len(files_in_dir) > 1:
+            raise ValueError(
+                "Multiple files in {}: {}".format(lpath, ", ".join([str(f) for f in files_in_dir]))
+            )
+        file = files_in_dir[0]
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_location = os.path.join(temp_dir, desired_name)
+            shutil.copyfile(src=file, dst=temp_location)
+            shutil.rmtree(lpath)
+            shutil.move(src=temp_location, dst=lpath)
+
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

@@ -224,6 +250,7 @@ class FsspecDownloader(Downloader):
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
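handle_directory_download covers an fsspec quirk: when an object key contains characters such as '?', fs.get can materialize it as a directory of the same name with the real file nested inside, instead of a plain file at the expected path. A self-contained illustration of the same reconciliation steps on a POSIX filesystem (all paths and contents are made up):

# Demonstration of the flattening behavior added above: a single file nested inside a
# directory named like the expected download path is collapsed back into a plain file.
import shutil
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as root:
    lpath = Path(root) / "report.pdf?version=2"            # expected download *file* path
    lpath.mkdir(parents=True)                              # ...but it arrived as a directory
    (lpath / "report.pdf?version=2").write_bytes(b"pdf bytes")

    # same reconciliation steps as handle_directory_download
    inner = next(p for p in lpath.iterdir() if p.is_file())
    tmp = Path(root) / "tmp_copy"
    shutil.copyfile(inner, tmp)
    shutil.rmtree(lpath)
    shutil.move(str(tmp), str(lpath))

    assert lpath.is_file() and lpath.read_bytes() == b"pdf bytes"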
unstructured_ingest/v2/processes/connectors/google_drive.py
CHANGED

@@ -19,12 +19,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry

@@ -294,7 +294,7 @@ class GoogleDriveDownloader(Downloader):
             _, downloaded = downloader.next_chunk()
         return downloaded

-    def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
+    def _write_file(self, file_data: FileData, file_contents: io.BytesIO) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")

@@ -303,7 +303,7 @@ class GoogleDriveDownloader(Downloader):
         return self.generate_download_response(file_data=file_data, download_path=download_path)

     @requires_dependencies(["googleapiclient"], extras="google-drive")
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         from googleapiclient.http import MediaIoBaseDownload

         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
unstructured_ingest/v2/processes/connectors/kafka/__init__.py
CHANGED

@@ -1,13 +1,17 @@
 from __future__ import annotations

 from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
     add_source_entry,
 )

 from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
-from .cloud import kafka_cloud_source_entry
+from .cloud import kafka_cloud_destination_entry, kafka_cloud_source_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
-from .local import kafka_local_source_entry
+from .local import kafka_local_destination_entry, kafka_local_source_entry

 add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+add_destination_entry(destination_type=LOCAL_CONNECTOR, entry=kafka_local_destination_entry)
+
 add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
+add_destination_entry(destination_type=CLOUD_CONNECTOR, entry=kafka_cloud_destination_entry)