unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +6 -2
- test/integration/connectors/sql/test_singlestore.py +6 -2
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +6 -2
- test/integration/connectors/test_milvus.py +13 -0
- test/integration/connectors/test_onedrive.py +6 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/embedders/test_bedrock.py +28 -0
- test/integration/embedders/test_octoai.py +14 -0
- test/integration/embedders/test_openai.py +13 -0
- test/integration/embedders/test_togetherai.py +10 -0
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/embed/test_octoai.py +8 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +39 -11
- unstructured_ingest/embed/interfaces.py +5 -0
- unstructured_ingest/embed/octoai.py +44 -3
- unstructured_ingest/embed/openai.py +37 -1
- unstructured_ingest/embed/togetherai.py +28 -1
- unstructured_ingest/embed/voyageai.py +33 -1
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/file_data.py +11 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
- unstructured_ingest/v2/processes/connectors/couchbase.py +2 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +3 -4
- unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +5 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/unstructured_api.py +25 -2
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +20 -16
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +52 -48
- test/integration/connectors/test_kafka.py +0 -304
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py

@@ -3,12 +3,12 @@ from typing import Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes_gcp"
 
 
-class DatabricksGoogleVolumesAccessConfig(
+class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
         description="The Databricks account ID for the Databricks " "accounts endpoint.",
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py

@@ -3,12 +3,12 @@ from typing import Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes"
 
 
-class DatabricksNativeVolumesAccessConfig(
+class DatabricksNativeVolumesAccessConfig(DatabricksVolumesAccessConfig):
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
         default=None, description="Client Secret of the OAuth app."
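In both volume connectors above, the per-platform access configs now inherit from the shared DatabricksVolumesAccessConfig base rather than the generic AccessConfig. A minimal sketch of that inheritance pattern with pydantic; the field names here are hypothetical and only illustrate how platform-specific fields layer on top of a shared base:

from typing import Optional

from pydantic import BaseModel, Field


class SharedVolumesAccessConfig(BaseModel):
    # Hypothetical stand-in for the shared DatabricksVolumesAccessConfig base.
    token: Optional[str] = Field(default=None, description="Personal access token.")


class GoogleVolumesAccessConfig(SharedVolumesAccessConfig):
    # Platform-specific fields are added on top of the inherited ones.
    account_id: Optional[str] = Field(default=None, description="Databricks account ID.")


cfg = GoogleVolumesAccessConfig(token="dapi-example", account_id="abc-123")
print(cfg.model_dump())  # includes both the inherited and the platform-specific field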
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py

@@ -255,6 +255,7 @@ class ElasticsearchDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
         cast_file_data = FileData.cast(file_data=file_data)
         cast_file_data.identifier = filename_id
         cast_file_data.metadata.date_processed = str(time())
@@ -264,7 +265,6 @@ class ElasticsearchDownloader(Downloader):
             "index_name": index_name,
             "document_id": record_id,
         }
-        cast_file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
         return super().generate_download_response(
             file_data=cast_file_data,
             download_path=download_path,
unstructured_ingest/v2/processes/connectors/kafka/cloud.py

@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 
 from pydantic import Field, Secret, SecretStr
 
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -50,6 +51,7 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
             "sasl.password": access_config.secret.get_secret_value(),
             "sasl.mechanism": "PLAIN",
             "security.protocol": "SASL_SSL",
+            "logger": logger,
         }
 
         return conf
@@ -61,10 +63,11 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
 
         conf = {
             "bootstrap.servers": f"{bootstrap}:{port}",
-            "sasl.username": access_config.kafka_api_key,
-            "sasl.password": access_config.secret,
+            "sasl.username": access_config.kafka_api_key.get_secret_value(),
+            "sasl.password": access_config.secret.get_secret_value(),
             "sasl.mechanism": "PLAIN",
             "security.protocol": "SASL_SSL",
+            "logger": logger,
         }
 
         return conf
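The second hunk above matters because pydantic Secret fields mask their value when rendered as strings; passing them straight into the librdkafka config would hand a masked placeholder to SASL instead of the real credential. A small standalone sketch of the difference (the class and field names are illustrative only):

from pydantic import BaseModel, SecretStr


class KafkaCreds(BaseModel):
    kafka_api_key: SecretStr
    secret: SecretStr


creds = KafkaCreds(kafka_api_key="my-key", secret="my-secret")

print(str(creds.secret))                # '**********' - masked representation
print(creds.secret.get_secret_value())  # 'my-secret'  - actual value

conf = {
    "sasl.username": creds.kafka_api_key.get_secret_value(),
    "sasl.password": creds.secret.get_secret_value(),
}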
unstructured_ingest/v2/processes/connectors/kafka/kafka.py

@@ -170,7 +170,7 @@ class KafkaIndexer(Indexer, ABC):
             ]
             if self.index_config.topic not in current_topics:
                 raise SourceConnectionError(
-                    "expected topic {} not detected in cluster: {}".format(
+                    "expected topic '{}' not detected in cluster: '{}'".format(
                         self.index_config.topic, ", ".join(current_topics)
                     )
                 )
@@ -232,6 +232,13 @@ class KafkaUploader(Uploader, ABC):
                 topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
             ]
             logger.info(f"successfully checked available topics: {current_topics}")
+            if self.upload_config.topic not in current_topics:
+                raise DestinationConnectionError(
+                    "expected topic '{}' not detected in cluster: '{}'".format(
+                        self.upload_config.topic, ", ".join(current_topics)
+                    )
+                )
+
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -243,8 +250,10 @@ class KafkaUploader(Uploader, ABC):
         failed_producer = False
 
         def acked(err, msg):
+            nonlocal failed_producer
             if err is not None:
-
+                failed_producer = True
+                logger.error("Failed to deliver kafka message: %s: %s" % (str(msg), str(err)))
 
         for element in elements:
             producer.produce(
@@ -253,7 +262,9 @@ class KafkaUploader(Uploader, ABC):
                 callback=acked,
             )
 
-        producer
+        while producer_len := len(producer):
+            logger.debug(f"another iteration of kafka producer flush. Queue length: {producer_len}")
+            producer.flush(timeout=self.upload_config.timeout)
         if failed_producer:
             raise KafkaException("failed to produce all messages in batch")
 
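The uploader hunks replace a dangling `producer` statement with a delivery callback that records failures and a loop that keeps flushing until the producer's queue is empty. A self-contained sketch of the same pattern against the confluent-kafka Producer API; the broker address and topic are placeholders:

import json

from confluent_kafka import KafkaException, Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})  # placeholder broker
failed_producer = False


def acked(err, msg):
    # Delivery callback, invoked from poll()/flush(); record failures for later.
    global failed_producer
    if err is not None:
        failed_producer = True
        print("Failed to deliver kafka message: %s: %s" % (str(msg), str(err)))


for element in [{"text": "hello"}, {"text": "world"}]:
    producer.produce("example-topic", value=json.dumps(element), callback=acked)

# len(producer) is the number of messages and events still queued; flush until it drains.
while queue_len := len(producer):
    print(f"another iteration of kafka producer flush. Queue length: {queue_len}")
    producer.flush(10.0)

if failed_producer:
    raise KafkaException("failed to produce all messages in batch")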
unstructured_ingest/v2/processes/connectors/milvus.py

@@ -156,11 +156,18 @@ class MilvusUploader(Uploader):
 
     @DestinationConnectionError.wrap
     def precheck(self):
-
-
-
-
-        )
+        from pymilvus import MilvusException
+
+        try:
+            with self.get_client() as client:
+                if not client.has_collection(self.upload_config.collection_name):
+                    raise DestinationConnectionError(
+                        f"Collection '{self.upload_config.collection_name}' does not exist"
+                    )
+        except MilvusException as milvus_exception:
+            raise DestinationConnectionError(
+                f"failed to precheck Milvus: {str(milvus_exception.message)}"
+            ) from milvus_exception
 
     @contextmanager
     def get_client(self) -> Generator["MilvusClient", None, None]:
@@ -197,7 +204,9 @@ class MilvusUploader(Uploader):
         try:
             res = client.insert(collection_name=self.upload_config.collection_name, data=data)
         except MilvusException as milvus_exception:
-            raise WriteError(
+            raise WriteError(
+                f"failed to upload records to Milvus: {str(milvus_exception.message)}"
+            ) from milvus_exception
         if "err_count" in res and isinstance(res["err_count"], int) and res["err_count"] > 0:
             err_count = res["err_count"]
             raise WriteError(f"failed to upload {err_count} docs")
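The new precheck fails fast if the target collection is missing instead of surfacing a raw pymilvus error during upload. A minimal standalone sketch of the same check; the URI and collection name are placeholders:

from pymilvus import MilvusClient, MilvusException


def precheck_collection(uri: str, collection_name: str) -> None:
    # Verify connectivity and the collection's existence before writing anything.
    try:
        client = MilvusClient(uri=uri)
        if not client.has_collection(collection_name):
            raise RuntimeError(f"Collection '{collection_name}' does not exist")
    except MilvusException as e:
        raise RuntimeError(f"failed to precheck Milvus: {e}") from e


precheck_collection(uri="http://localhost:19530", collection_name="example_collection")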
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -198,14 +198,13 @@ class MongoDBDownloader(Downloader):
         concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
 
         # Create a FileData object for each document with source_identifiers
-        cast_file_data = FileData.cast(file_data=file_data)
-        cast_file_data.identifier = str(doc_id)
         filename = f"{doc_id}.txt"
-
+        file_data.source_identifiers = SourceIdentifiers(
             filename=filename,
             fullpath=filename,
-            rel_path=filename,
         )
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = str(doc_id)
 
         # Determine the download path
         download_path = self.get_download_path(file_data=cast_file_data)
unstructured_ingest/v2/processes/connectors/neo4j.py

@@ -378,6 +378,8 @@ class Neo4jUploader(Uploader):
 
 neo4j_destination_entry = DestinationRegistryEntry(
     connection_config=Neo4jConnectionConfig,
+    upload_stager=Neo4jUploadStager,
+    upload_stager_config=Neo4jUploadStagerConfig,
     uploader=Neo4jUploader,
     uploader_config=Neo4jUploaderConfig,
 )
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -1,10 +1,11 @@
 from __future__ import annotations
 
+import asyncio
 import json
 from dataclasses import dataclass
 from pathlib import Path
 from time import time
-from typing import TYPE_CHECKING, Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, AsyncIterator, Generator, Iterator, Optional, TypeVar
 
 from dateutil import parser
 from pydantic import Field, Secret
@@ -100,6 +101,27 @@ class OnedriveIndexerConfig(IndexerConfig):
     recursive: bool = False
 
 
+T = TypeVar("T")
+
+
+def async_iterable_to_sync_iterable(iterator: AsyncIterator[T]) -> Iterator[T]:
+    # This version works on Python 3.9 by manually handling the async iteration.
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        while True:
+            try:
+                # Instead of anext(iterator), we directly call __anext__().
+                # __anext__ returns a coroutine that we must run until complete.
+                future = iterator.__anext__()
+                result = loop.run_until_complete(future)
+                yield result
+            except StopAsyncIteration:
+                break
+    finally:
+        loop.close()
+
+
 @dataclass
 class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
@@ -116,17 +138,21 @@ class OnedriveIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
 
-    def
+    def list_objects_sync(self, folder: DriveItem, recursive: bool) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
         if not recursive:
             return files
+
         folders = [d for d in drive_items if d.is_folder]
         for f in folders:
-            files.extend(self.
+            files.extend(self.list_objects_sync(f, recursive))
         return files
 
-    def
+    async def list_objects(self, folder: "DriveItem", recursive: bool) -> list["DriveItem"]:
+        return await asyncio.to_thread(self.list_objects_sync, folder, recursive)
+
+    def get_root_sync(self, client: "GraphClient") -> "DriveItem":
         root = client.users[self.connection_config.user_pname].drive.get().execute_query().root
         if fpath := self.index_config.path:
             root = root.get_by_path(fpath).get().execute_query()
@@ -134,7 +160,10 @@ class OnedriveIndexer(Indexer):
             raise ValueError(f"Unable to find directory, given: {fpath}")
         return root
 
-    def
+    async def get_root(self, client: "GraphClient") -> "DriveItem":
+        return await asyncio.to_thread(self.get_root_sync, client)
+
+    def get_properties_sync(self, drive_item: "DriveItem") -> dict:
         properties = drive_item.properties
         filtered_properties = {}
         for k, v in properties.items():
@@ -145,7 +174,10 @@ class OnedriveIndexer(Indexer):
                 pass
         return filtered_properties
 
-    def
+    async def get_properties(self, drive_item: "DriveItem") -> dict:
+        return await asyncio.to_thread(self.get_properties_sync, drive_item)
+
+    def drive_item_to_file_data_sync(self, drive_item: "DriveItem") -> FileData:
         file_path = drive_item.parent_reference.path.split(":")[-1]
         file_path = file_path[1:] if file_path and file_path[0] == "/" else file_path
         filename = drive_item.name
@@ -176,17 +208,34 @@ class OnedriveIndexer(Indexer):
                     "server_relative_path": server_path,
                 },
             ),
-            additional_metadata=self.
+            additional_metadata=self.get_properties_sync(drive_item=drive_item),
         )
 
-    def
-
-
-
+    async def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData:
+        # Offload the file data creation if it's not guaranteed async
+        return await asyncio.to_thread(self.drive_item_to_file_data_sync, drive_item)
+
+    async def _run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
+        token_resp = await asyncio.to_thread(self.connection_config.get_token)
+        if "error" in token_resp:
+            raise SourceConnectionError(
+                f"[{CONNECTOR_TYPE}]: {token_resp['error']} ({token_resp.get('error_description')})"
+            )
+
+        client = await asyncio.to_thread(self.connection_config.get_client)
+        root = await self.get_root(client=client)
+        drive_items = await self.list_objects(folder=root, recursive=self.index_config.recursive)
+
         for drive_item in drive_items:
-            file_data = self.drive_item_to_file_data(drive_item=drive_item)
+            file_data = await self.drive_item_to_file_data(drive_item=drive_item)
             yield file_data
 
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        # Convert the async generator to a sync generator without loading all data into memory
+        async_gen = self._run_async(**kwargs)
+        for item in async_iterable_to_sync_iterable(async_gen):
+            yield item
+
 
 class OnedriveDownloaderConfig(DownloaderConfig):
     pass
@@ -220,19 +269,24 @@ class OnedriveDownloader(Downloader):
 
     @SourceConnectionError.wrap
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            file = self._fetch_file(file_data=file_data)
+            fsize = file.get_property("size", 0)
+            download_path = self.get_download_path(file_data=file_data)
+            download_path.parent.mkdir(parents=True, exist_ok=True)
+            logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")
+            if fsize > MAX_MB_SIZE:
+                logger.info(f"downloading file with size: {fsize} bytes in chunks")
+                with download_path.open(mode="wb") as f:
+                    file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
+            else:
+                with download_path.open(mode="wb") as f:
+                    file.download(f).execute_query()
+            return self.generate_download_response(file_data=file_data, download_path=download_path)
+        except Exception as e:
+            logger.error(f"[{CONNECTOR_TYPE}] Exception during downloading: {e}", exc_info=True)
+            # Re-raise to see full stack trace locally
+            raise
 
 
 class OnedriveUploaderConfig(UploaderConfig):
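The indexer now builds its listing on an async code path and exposes it through the sync `run()` generator via `async_iterable_to_sync_iterable`. A self-contained sketch of that bridge, with a toy async generator standing in for the OneDrive listing:

import asyncio
from typing import AsyncIterator, Iterator, TypeVar

T = TypeVar("T")


def async_iterable_to_sync_iterable(iterator: AsyncIterator[T]) -> Iterator[T]:
    # Drive an async iterator from sync code by running __anext__() on a private
    # event loop; this also works on Python 3.9, which lacks the anext() builtin.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while True:
            try:
                yield loop.run_until_complete(iterator.__anext__())
            except StopAsyncIteration:
                break
    finally:
        loop.close()


async def list_items() -> AsyncIterator[str]:
    # Stand-in for the async OneDrive listing.
    for name in ("a.docx", "b.pdf"):
        await asyncio.sleep(0.01)
        yield name


for item in async_iterable_to_sync_iterable(list_items()):
    print(item)  # items arrive one at a time, without buffering the whole listing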
unstructured_ingest/v2/processes/connectors/redisdb.py (new file)

@@ -0,0 +1,182 @@
+import json
+from contextlib import asynccontextmanager, contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
+
+from pydantic import Field, Secret, model_validator
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+
+if TYPE_CHECKING:
+    from redis.asyncio import Redis
+
+import asyncio
+
+CONNECTOR_TYPE = "redis"
+SERVER_API_VERSION = "1"
+
+
+class RedisAccessConfig(AccessConfig):
+    uri: Optional[str] = Field(
+        default=None, description="If not anonymous, use this uri, if specified."
+    )
+    password: Optional[str] = Field(
+        default=None, description="If not anonymous, use this password, if specified."
+    )
+
+
+class RedisConnectionConfig(ConnectionConfig):
+    access_config: Secret[RedisAccessConfig] = Field(
+        default=RedisAccessConfig(), validate_default=True
+    )
+    host: Optional[str] = Field(
+        default=None, description="Hostname or IP address of a Redis instance to connect to."
+    )
+    database: int = Field(default=0, description="Database index to connect to.")
+    port: int = Field(default=6379, description="port used to connect to database.")
+    username: Optional[str] = Field(
+        default=None, description="Username used to connect to database."
+    )
+    ssl: bool = Field(default=True, description="Whether the connection should use SSL encryption.")
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+    @model_validator(mode="after")
+    def validate_host_or_url(self) -> "RedisConnectionConfig":
+        if not self.access_config.get_secret_value().uri and not self.host:
+            raise ValueError("Please pass a hostname either directly or through uri")
+        return self
+
+    @requires_dependencies(["redis"], extras="redis")
+    @asynccontextmanager
+    async def create_async_client(self) -> AsyncGenerator["Redis", None]:
+        from redis.asyncio import Redis, from_url
+
+        access_config = self.access_config.get_secret_value()
+
+        options = {
+            "host": self.host,
+            "port": self.port,
+            "db": self.database,
+            "ssl": self.ssl,
+            "username": self.username,
+        }
+
+        if access_config.password:
+            options["password"] = access_config.password
+
+        if access_config.uri:
+            async with from_url(access_config.uri) as client:
+                yield client
+        else:
+            async with Redis(**options) as client:
+                yield client
+
+    @requires_dependencies(["redis"], extras="redis")
+    @contextmanager
+    def create_client(self) -> Generator["Redis", None, None]:
+        from redis import Redis, from_url
+
+        access_config = self.access_config.get_secret_value()
+
+        options = {
+            "host": self.host,
+            "port": self.port,
+            "db": self.database,
+            "ssl": self.ssl,
+            "username": self.username,
+        }
+
+        if access_config.password:
+            options["password"] = access_config.password
+
+        if access_config.uri:
+            with from_url(access_config.uri) as client:
+                yield client
+        else:
+            with Redis(**options) as client:
+                yield client
+
+
+class RedisUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+@dataclass
+class RedisUploader(Uploader):
+    upload_config: RedisUploaderConfig
+    connection_config: RedisConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def is_async(self) -> bool:
+        return True
+
+    def precheck(self) -> None:
+        try:
+            with self.connection_config.create_client() as client:
+                client.ping()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        first_element = data[0]
+        redis_stack = await self._check_redis_stack(first_element)
+        logger.info(
+            f"writing {len(data)} objects to destination asynchronously, "
+            f"db, {self.connection_config.database}, "
+            f"at {self.connection_config.host}",
+        )
+
+        batches = list(batch_generator(data, batch_size=self.upload_config.batch_size))
+        await asyncio.gather(*[self._write_batch(batch, redis_stack) for batch in batches])
+
+    async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
+        async with self.connection_config.create_async_client() as async_client:
+            async with async_client.pipeline(transaction=True) as pipe:
+                for element in batch:
+                    element_id = element["element_id"]
+                    if redis_stack:
+                        pipe.json().set(element_id, "$", element)
+                    else:
+                        pipe.set(element_id, json.dumps(element))
+                await pipe.execute()
+
+    @requires_dependencies(["redis"], extras="redis")
+    async def _check_redis_stack(self, element: dict) -> bool:
+        from redis import exceptions as redis_exceptions
+
+        redis_stack = True
+        async with self.connection_config.create_async_client() as async_client:
+            async with async_client.pipeline(transaction=True) as pipe:
+                element_id = element["element_id"]
+                try:
+                    # Redis with stack extension supports JSON type
+                    await pipe.json().set(element_id, "$", element).execute()
+                except redis_exceptions.ResponseError as e:
+                    message = str(e)
+                    if "unknown command `JSON.SET`" in message:
+                        # if this error occurs, Redis server doesn't support JSON type,
+                        # so save as string type instead
+                        await pipe.set(element_id, json.dumps(element)).execute()
+                        redis_stack = False
+                    else:
+                        raise e
+        return redis_stack
+
+
+redis_destination_entry = DestinationRegistryEntry(
+    connection_config=RedisConnectionConfig,
+    uploader=RedisUploader,
+    uploader_config=RedisUploaderConfig,
+)
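The uploader probes the first element to decide between RedisJSON (`JSON.SET`) on Redis Stack and a plain string `SET` fallback. A compact sketch of that fallback using the synchronous redis-py client; the connection details are placeholders:

import json

from redis import Redis
from redis import exceptions as redis_exceptions

client = Redis(host="localhost", port=6379, db=0)  # placeholder connection
element = {"element_id": "abc123", "text": "hello"}

try:
    # Redis Stack (RedisJSON module) stores the element as a native JSON document.
    client.json().set(element["element_id"], "$", element)
    print("stored as JSON document")
except redis_exceptions.ResponseError as e:
    if "unknown command" in str(e):
        # Plain Redis has no JSON.SET, so fall back to a serialized string value.
        client.set(element["element_id"], json.dumps(element))
        print("stored as JSON string")
    else:
        raise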
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -28,6 +28,7 @@ from unstructured_ingest.v2.interfaces import (
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -218,6 +219,10 @@ class SQLDownloader(Downloader, ABC):
         )
         download_path.parent.mkdir(parents=True, exist_ok=True)
         result.to_csv(download_path, index=False)
+        file_data.source_identifiers = SourceIdentifiers(
+            filename=filename,
+            fullpath=filename,
+        )
         cast_file_data = FileData.cast(file_data=file_data)
         cast_file_data.identifier = filename_id
         return super().generate_download_response(