unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (93)
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +156 -0
  10. test/integration/connectors/test_azure_cog_search.py +233 -0
  11. test/integration/connectors/test_delta_table.py +46 -0
  12. test/integration/connectors/test_kafka.py +150 -16
  13. test/integration/connectors/test_lancedb.py +209 -0
  14. test/integration/connectors/test_milvus.py +141 -0
  15. test/integration/connectors/test_pinecone.py +213 -0
  16. test/integration/connectors/test_s3.py +23 -0
  17. test/integration/connectors/utils/docker.py +81 -15
  18. test/integration/connectors/utils/validation.py +10 -0
  19. test/integration/connectors/weaviate/__init__.py +0 -0
  20. test/integration/connectors/weaviate/conftest.py +15 -0
  21. test/integration/connectors/weaviate/test_local.py +131 -0
  22. test/unit/v2/__init__.py +0 -0
  23. test/unit/v2/chunkers/__init__.py +0 -0
  24. test/unit/v2/chunkers/test_chunkers.py +49 -0
  25. test/unit/v2/connectors/__init__.py +0 -0
  26. test/unit/v2/embedders/__init__.py +0 -0
  27. test/unit/v2/embedders/test_bedrock.py +36 -0
  28. test/unit/v2/embedders/test_huggingface.py +48 -0
  29. test/unit/v2/embedders/test_mixedbread.py +37 -0
  30. test/unit/v2/embedders/test_octoai.py +35 -0
  31. test/unit/v2/embedders/test_openai.py +35 -0
  32. test/unit/v2/embedders/test_togetherai.py +37 -0
  33. test/unit/v2/embedders/test_vertexai.py +37 -0
  34. test/unit/v2/embedders/test_voyageai.py +38 -0
  35. test/unit/v2/partitioners/__init__.py +0 -0
  36. test/unit/v2/partitioners/test_partitioner.py +63 -0
  37. test/unit/v2/utils/__init__.py +0 -0
  38. test/unit/v2/utils/data_generator.py +32 -0
  39. unstructured_ingest/__version__.py +1 -1
  40. unstructured_ingest/cli/cmds/__init__.py +2 -2
  41. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  42. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  43. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  44. unstructured_ingest/runner/writers/__init__.py +2 -2
  45. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  46. unstructured_ingest/utils/data_prep.py +9 -1
  47. unstructured_ingest/v2/constants.py +2 -0
  48. unstructured_ingest/v2/processes/connectors/__init__.py +7 -20
  49. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  50. unstructured_ingest/v2/processes/connectors/astradb.py +35 -23
  51. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +116 -35
  52. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  53. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  54. unstructured_ingest/v2/processes/connectors/delta_table.py +37 -9
  55. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  56. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +93 -46
  57. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  58. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
  59. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  60. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
  61. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
  62. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +84 -23
  63. unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
  64. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  65. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  66. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  67. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  69. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  70. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
  72. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +101 -13
  74. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  75. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  77. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  78. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  79. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  80. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  81. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  82. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
  83. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +20 -19
  84. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +91 -50
  85. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  86. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  87. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  88. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  89. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  90. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
  91. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
  92. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
  93. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py}

@@ -2,6 +2,7 @@ import hashlib
  import json
  import sys
  import uuid
+ from contextlib import contextmanager
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
@@ -13,9 +14,11 @@ from unstructured_ingest.error import (
  DestinationConnectionError,
  SourceConnectionError,
  SourceConnectionNetworkError,
+ WriteError,
  )
  from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
  from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
  from unstructured_ingest.v2.interfaces import (
  AccessConfig,
  ConnectionConfig,
@@ -26,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
  FileDataSourceMetadata,
  Indexer,
  IndexerConfig,
+ SourceIdentifiers,
  Uploader,
  UploaderConfig,
  UploadStager,
@@ -116,19 +120,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
  return client_kwargs

  @requires_dependencies(["elasticsearch"], extras="elasticsearch")
- def get_client(self) -> "ElasticsearchClient":
+ @contextmanager
+ def get_client(self) -> Generator["ElasticsearchClient", None, None]:
  from elasticsearch import Elasticsearch as ElasticsearchClient

- client = ElasticsearchClient(**self.get_client_kwargs())
- self.check_connection(client=client)
- return client
-
- def check_connection(self, client: "ElasticsearchClient"):
- try:
- client.perform_request("HEAD", "/", headers={"accept": "application/json"})
- except Exception as e:
- logger.error(f"failed to validate connection: {e}", exc_info=True)
- raise SourceConnectionError(f"failed to validate connection: {e}")
+ with ElasticsearchClient(**self.get_client_kwargs()) as client:
+ yield client


  class ElasticsearchIndexerConfig(IndexerConfig):
@@ -144,7 +141,16 @@ class ElasticsearchIndexer(Indexer):

  def precheck(self) -> None:
  try:
- self.connection_config.get_client()
+ with self.connection_config.get_client() as client:
+ if not client.ping():
+ raise SourceConnectionError("cluster not detected")
+ indices = client.indices.get_alias(index="*")
+ if self.index_config.index_name not in indices:
+ raise SourceConnectionError(
+ "index {} not found: {}".format(
+ self.index_config.index_name, ", ".join(indices.keys())
+ )
+ )
  except Exception as e:
  logger.error(f"failed to validate connection: {e}", exc_info=True)
  raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -160,15 +166,15 @@ class ElasticsearchIndexer(Indexer):
  scan = self.load_scan()

  scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
- client = self.connection_config.get_client()
- hits = scan(
- client,
- query=scan_query,
- scroll="1m",
- index=self.index_config.index_name,
- )
+ with self.connection_config.get_client() as client:
+ hits = scan(
+ client,
+ query=scan_query,
+ scroll="1m",
+ index=self.index_config.index_name,
+ )

- return {hit["_id"] for hit in hits}
+ return {hit["_id"] for hit in hits}

  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
  all_ids = self._get_doc_ids()
@@ -191,6 +197,7 @@ class ElasticsearchIndexer(Indexer):
  yield FileData(
  identifier=identified,
  connector_type=CONNECTOR_TYPE,
+ doc_type="batch",
  metadata=FileDataSourceMetadata(
  url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
  date_processed=str(time()),
@@ -256,6 +263,7 @@ class ElasticsearchDownloader(Downloader):
  file_data=FileData(
  identifier=filename_id,
  connector_type=CONNECTOR_TYPE,
+ source_identifiers=SourceIdentifiers(filename=filename, fullpath=filename),
  metadata=FileDataSourceMetadata(
  version=str(result["_version"]) if "_version" in result else None,
  date_processed=str(time()),
@@ -317,7 +325,7 @@ class ElasticsearchUploadStagerConfig(UploadStagerConfig):
  class ElasticsearchUploadStager(UploadStager):
  upload_stager_config: ElasticsearchUploadStagerConfig

- def conform_dict(self, data: dict) -> dict:
+ def conform_dict(self, data: dict, file_data: FileData) -> dict:
  resp = {
  "_index": self.upload_stager_config.index_name,
  "_id": str(uuid.uuid4()),
@@ -326,6 +334,7 @@ class ElasticsearchUploadStager(UploadStager):
  "embeddings": data.pop("embeddings", None),
  "text": data.pop("text", None),
  "type": data.pop("type", None),
+ RECORD_ID_LABEL: file_data.identifier,
  },
  }
  if "metadata" in data and isinstance(data["metadata"], dict):
@@ -342,10 +351,17 @@
  ) -> Path:
  with open(elements_filepath) as elements_file:
  elements_contents = json.load(elements_file)
- conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
- output_path = Path(output_dir) / Path(f"{output_filename}.json")
+ conformed_elements = [
+ self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+ ]
+ if Path(output_filename).suffix != ".json":
+ output_filename = f"{output_filename}.json"
+ else:
+ output_filename = f"{Path(output_filename).stem}.json"
+ output_path = Path(output_dir) / output_filename
+ output_path.parent.mkdir(parents=True, exist_ok=True)
  with open(output_path, "w") as output_file:
- json.dump(conformed_elements, output_file)
+ json.dump(conformed_elements, output_file, indent=2)
  return output_path


@@ -362,6 +378,10 @@ class ElasticsearchUploaderConfig(UploaderConfig):
  num_threads: int = Field(
  default=4, description="Number of threads to be used while uploading content"
  )
+ record_id_key: str = Field(
+ default=RECORD_ID_LABEL,
+ description="searchable key to find entries for the same record on previous runs",
+ )


  @dataclass
@@ -372,7 +392,16 @@

  def precheck(self) -> None:
  try:
- self.connection_config.get_client()
+ with self.connection_config.get_client() as client:
+ if not client.ping():
+ raise DestinationConnectionError("cluster not detected")
+ indices = client.indices.get_alias(index="*")
+ if self.upload_config.index_name not in indices:
+ raise SourceConnectionError(
+ "index {} not found: {}".format(
+ self.upload_config.index_name, ", ".join(indices.keys())
+ )
+ )
  except Exception as e:
  logger.error(f"failed to validate connection: {e}", exc_info=True)
  raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -383,6 +412,23 @@

  return parallel_bulk

+ def delete_by_record_id(self, client, file_data: FileData) -> None:
+ logger.debug(
+ f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+ f"from {self.upload_config.index_name} index"
+ )
+ delete_resp = client.delete_by_query(
+ index=self.upload_config.index_name,
+ body={"query": {"match": {self.upload_config.record_id_key: file_data.identifier}}},
+ )
+ logger.info(
+ "deleted {} records from index {}".format(
+ delete_resp["deleted"], self.upload_config.index_name
+ )
+ )
+ if failures := delete_resp.get("failures"):
+ raise WriteError(f"failed to delete records: {failures}")
+
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
  parallel_bulk = self.load_parallel_bulk()
  with path.open("r") as file:
@@ -396,28 +442,29 @@
  f"{self.upload_config.num_threads} (number of) threads"
  )

- client = self.connection_config.get_client()
- if not client.indices.exists(index=self.upload_config.index_name):
- logger.warning(
- f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
- f"{self.upload_config.index_name}. "
- f"This may cause issues when uploading."
- )
- for batch in generator_batching_wbytes(
- elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
- ):
- for success, info in parallel_bulk(
- client=client,
- actions=batch,
- thread_count=self.upload_config.num_threads,
+ with self.connection_config.get_client() as client:
+ self.delete_by_record_id(client=client, file_data=file_data)
+ if not client.indices.exists(index=self.upload_config.index_name):
+ logger.warning(
+ f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
+ f"{self.upload_config.index_name}. "
+ f"This may cause issues when uploading."
+ )
+ for batch in generator_batching_wbytes(
+ elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
  ):
- if not success:
- logger.error(
- "upload failed for a batch in "
- f"{(self.__class__.__name__).replace('Uploader', '')} "
- "destination connector:",
- info,
- )
+ for success, info in parallel_bulk(
+ client=client,
+ actions=batch,
+ thread_count=self.upload_config.num_threads,
+ ):
+ if not success:
+ logger.error(
+ "upload failed for a batch in "
+ f"{(self.__class__.__name__).replace('Uploader', '')} "
+ "destination connector:",
+ info,
+ )


  elasticsearch_source_entry = SourceRegistryEntry(
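
The main behavioral change in this file is that ElasticsearchConnectionConfig.get_client() is now a context manager, and both prechecks ping the cluster and verify that the target index exists. Below is a minimal sketch of that pattern using the elasticsearch client directly; the localhost URL and the "ingest-test" index name are placeholders, not values from the diff.

    from contextlib import contextmanager
    from typing import Generator

    from elasticsearch import Elasticsearch


    @contextmanager
    def get_client() -> Generator[Elasticsearch, None, None]:
        # Same shape as the reworked get_client(): open the client lazily and
        # close it automatically when the with-block exits.
        with Elasticsearch(hosts=["http://localhost:9200"]) as client:  # placeholder host
            yield client


    with get_client() as client:
        # Mirrors the checks added to precheck(): ping the cluster, then confirm
        # the target index is present among the cluster's indices.
        if not client.ping():
            raise RuntimeError("cluster not detected")
        indices = client.indices.get_alias(index="*")
        if "ingest-test" not in indices:  # placeholder index name
            raise RuntimeError("index ingest-test not found: " + ", ".join(indices))

The same client handling underpins the new upsert behavior: because every staged element now carries RECORD_ID_LABEL, the uploader's delete_by_record_id() can delete_by_query any documents left over from a previous run of the same record before re-indexing the new batch.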
unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py}

@@ -17,7 +17,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
  DestinationRegistryEntry,
  SourceRegistryEntry,
  )
- from unstructured_ingest.v2.processes.connectors.elasticsearch import (
+ from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
  ElasticsearchDownloader,
  ElasticsearchDownloaderConfig,
  ElasticsearchIndexer,
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,6 +1,9 @@
  from __future__ import annotations

+ import os
  import random
+ import shutil
+ import tempfile
  from dataclasses import dataclass, field
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -207,12 +210,35 @@ class FsspecDownloader(Downloader):
  **self.connection_config.get_access_config(),
  )

+ def handle_directory_download(self, lpath: Path) -> None:
+ # If the object's name contains certain characters (i.e. '?'), it
+ # gets downloaded into a new directory of the same name. This
+ # reconciles that with what is expected, which is to download it
+ # as a file that is not within a directory.
+ if not lpath.is_dir():
+ return
+ desired_name = lpath.name
+ files_in_dir = [file for file in lpath.iterdir() if file.is_file()]
+ if not files_in_dir:
+ raise ValueError(f"no files in {lpath}")
+ if len(files_in_dir) > 1:
+ raise ValueError(
+ "Multiple files in {}: {}".format(lpath, ", ".join([str(f) for f in files_in_dir]))
+ )
+ file = files_in_dir[0]
+ with tempfile.TemporaryDirectory() as temp_dir:
+ temp_location = os.path.join(temp_dir, desired_name)
+ shutil.copyfile(src=file, dst=temp_location)
+ shutil.rmtree(lpath)
+ shutil.move(src=temp_location, dst=lpath)
+
  def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
  download_path = self.get_download_path(file_data=file_data)
  download_path.parent.mkdir(parents=True, exist_ok=True)
  try:
  rpath = file_data.additional_metadata["original_file_path"]
  self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+ self.handle_directory_download(lpath=download_path)
  except Exception as e:
  logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -224,6 +250,7 @@ class FsspecDownloader(Downloader):
  try:
  rpath = file_data.additional_metadata["original_file_path"]
  await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+ self.handle_directory_download(lpath=download_path)
  except Exception as e:
  logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
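
The new handle_directory_download() hook exists because fsspec can materialize an object whose name contains characters such as '?' as a directory wrapping a single file. The standalone sketch below reproduces that reconciliation outside the connector; the helper name flatten_single_file_dir and the "report?.pdf" layout are made up for illustration and assume a POSIX filesystem where '?' is a legal filename character.

    import shutil
    import tempfile
    from pathlib import Path


    def flatten_single_file_dir(lpath: Path) -> None:
        # Same idea as handle_directory_download(): if the download landed as a
        # directory containing exactly one file, replace the directory with that file.
        if not lpath.is_dir():
            return
        files_in_dir = [f for f in lpath.iterdir() if f.is_file()]
        if len(files_in_dir) != 1:
            raise ValueError(f"expected exactly one file in {lpath}, found {len(files_in_dir)}")
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_location = Path(temp_dir) / lpath.name
            shutil.copyfile(files_in_dir[0], temp_location)
            shutil.rmtree(lpath)
            shutil.move(str(temp_location), str(lpath))


    # Illustrative scenario: "report?.pdf" was written as a directory holding the real file.
    workdir = Path(tempfile.mkdtemp())
    odd_path = workdir / "report?.pdf"
    odd_path.mkdir(parents=True)
    (odd_path / "report?.pdf").write_text("file contents")
    flatten_single_file_dir(odd_path)
    assert odd_path.is_file()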
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -19,12 +19,12 @@ from unstructured_ingest.v2.interfaces import (
  ConnectionConfig,
  Downloader,
  DownloaderConfig,
+ DownloadResponse,
  FileData,
  FileDataSourceMetadata,
  Indexer,
  IndexerConfig,
  SourceIdentifiers,
- download_responses,
  )
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
@@ -294,7 +294,7 @@ class GoogleDriveDownloader(Downloader):
  _, downloaded = downloader.next_chunk()
  return downloaded

- def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
+ def _write_file(self, file_data: FileData, file_contents: io.BytesIO) -> DownloadResponse:
  download_path = self.get_download_path(file_data=file_data)
  download_path.parent.mkdir(parents=True, exist_ok=True)
  logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
@@ -303,7 +303,7 @@
  return self.generate_download_response(file_data=file_data, download_path=download_path)

  @requires_dependencies(["googleapiclient"], extras="google-drive")
- def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
  from googleapiclient.http import MediaIoBaseDownload

  logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
unstructured_ingest/v2/processes/connectors/kafka/__init__.py

@@ -1,13 +1,17 @@
  from __future__ import annotations

  from unstructured_ingest.v2.processes.connector_registry import (
+ add_destination_entry,
  add_source_entry,
  )

  from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
- from .cloud import kafka_cloud_source_entry
+ from .cloud import kafka_cloud_destination_entry, kafka_cloud_source_entry
  from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
- from .local import kafka_local_source_entry
+ from .local import kafka_local_destination_entry, kafka_local_source_entry

  add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+ add_destination_entry(destination_type=LOCAL_CONNECTOR, entry=kafka_local_destination_entry)
+
  add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
+ add_destination_entry(destination_type=CLOUD_CONNECTOR, entry=kafka_cloud_destination_entry)
unstructured_ingest/v2/processes/connectors/kafka/cloud.py

@@ -4,7 +4,10 @@ from typing import TYPE_CHECKING, Optional

  from pydantic import Field, Secret, SecretStr

- from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+ from unstructured_ingest.v2.processes.connector_registry import (
+ DestinationRegistryEntry,
+ SourceRegistryEntry,
+ )
  from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
  KafkaAccessConfig,
  KafkaConnectionConfig,
@@ -12,6 +15,8 @@ from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
  KafkaDownloaderConfig,
  KafkaIndexer,
  KafkaIndexerConfig,
+ KafkaUploader,
+ KafkaUploaderConfig,
  )

  if TYPE_CHECKING:
@@ -41,7 +46,21 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
  "group.id": "default_group_id",
  "enable.auto.commit": "false",
  "auto.offset.reset": "earliest",
- "message.max.bytes": 10485760,
+ "sasl.username": access_config.api_key,
+ "sasl.password": access_config.secret,
+ "sasl.mechanism": "PLAIN",
+ "security.protocol": "SASL_SSL",
+ }
+
+ return conf
+
+ def get_producer_configuration(self) -> dict:
+ bootstrap = self.bootstrap_server
+ port = self.port
+ access_config = self.access_config.get_secret_value()
+
+ conf = {
+ "bootstrap.servers": f"{bootstrap}:{port}",
  "sasl.username": access_config.api_key,
  "sasl.password": access_config.secret,
  "sasl.mechanism": "PLAIN",
@@ -73,6 +92,17 @@ class CloudKafkaDownloader(KafkaDownloader):
  connector_type: str = CONNECTOR_TYPE


+ class CloudKafkaUploaderConfig(KafkaUploaderConfig):
+ pass
+
+
+ @dataclass
+ class CloudKafkaUploader(KafkaUploader):
+ connection_config: CloudKafkaConnectionConfig
+ upload_config: CloudKafkaUploaderConfig
+ connector_type: str = CONNECTOR_TYPE
+
+
  kafka_cloud_source_entry = SourceRegistryEntry(
  connection_config=CloudKafkaConnectionConfig,
  indexer=CloudKafkaIndexer,
@@ -80,3 +110,9 @@ kafka_cloud_source_entry = SourceRegistryEntry(
  downloader=CloudKafkaDownloader,
  downloader_config=CloudKafkaDownloaderConfig,
  )
+
+ kafka_cloud_destination_entry = DestinationRegistryEntry(
+ connection_config=CloudKafkaConnectionConfig,
+ uploader=CloudKafkaUploader,
+ uploader_config=CloudKafkaUploaderConfig,
+ )
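
For context, the dict built by the new get_producer_configuration() follows the standard librdkafka SASL_SSL settings for a hosted Kafka cluster. A short sketch of handing that shape of config straight to confluent_kafka is shown below; the broker address and credentials are placeholders, and in the connector they come from CloudKafkaConnectionConfig and its access config.

    from confluent_kafka import Producer

    # Placeholder broker and credentials; the connector fills these from
    # bootstrap_server, port, api_key, and secret.
    conf = {
        "bootstrap.servers": "broker.example.com:9092",
        "sasl.username": "<api key>",
        "sasl.password": "<api secret>",
        "sasl.mechanism": "PLAIN",
        "security.protocol": "SASL_SSL",
    }
    producer = Producer(conf)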
unstructured_ingest/v2/processes/connectors/kafka/kafka.py

@@ -1,3 +1,4 @@
+ import json
  from abc import ABC, abstractmethod
  from contextlib import contextmanager
  from dataclasses import dataclass, field
@@ -5,32 +6,33 @@ from pathlib import Path
  from time import time
  from typing import TYPE_CHECKING, Any, ContextManager, Generator, Optional

- from pydantic import Secret
+ from pydantic import Field, Secret

  from unstructured_ingest.error import (
+ DestinationConnectionError,
  SourceConnectionError,
  SourceConnectionNetworkError,
  )
+ from unstructured_ingest.utils.data_prep import batch_generator
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
  AccessConfig,
  ConnectionConfig,
  Downloader,
  DownloaderConfig,
+ DownloadResponse,
  FileData,
  FileDataSourceMetadata,
  Indexer,
  IndexerConfig,
  SourceIdentifiers,
- download_responses,
+ Uploader,
+ UploaderConfig,
  )
  from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry

  if TYPE_CHECKING:
- from confluent_kafka import Consumer
-
- CONNECTOR_TYPE = "kafka"
+ from confluent_kafka import Consumer, Producer


  class KafkaAccessConfig(AccessConfig, ABC):
@@ -39,7 +41,6 @@ class KafkaAccessConfig(AccessConfig, ABC):

  class KafkaConnectionConfig(ConnectionConfig, ABC):
  access_config: Secret[KafkaAccessConfig]
- timeout: Optional[float] = 1.0
  bootstrap_server: str
  port: int

@@ -47,6 +48,10 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
  def get_consumer_configuration(self) -> dict:
  pass

+ @abstractmethod
+ def get_producer_configuration(self) -> dict:
+ pass
+
  @contextmanager
  @requires_dependencies(["confluent_kafka"], extras="kafka")
  def get_consumer(self) -> ContextManager["Consumer"]:
@@ -59,20 +64,27 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
  finally:
  consumer.close()

+ @requires_dependencies(["confluent_kafka"], extras="kafka")
+ def get_producer(self) -> "Producer":
+ from confluent_kafka import Producer
+
+ producer = Producer(self.get_producer_configuration())
+ return producer
+

  class KafkaIndexerConfig(IndexerConfig):
- topic: str
+ topic: str = Field(description="which topic to consume from")
  num_messages_to_consume: Optional[int] = 100
+ timeout: Optional[float] = Field(default=1.0, description="polling timeout")

  def update_consumer(self, consumer: "Consumer") -> None:
  consumer.subscribe([self.topic])


  @dataclass
- class KafkaIndexer(Indexer):
+ class KafkaIndexer(Indexer, ABC):
  connection_config: KafkaConnectionConfig
  index_config: KafkaIndexerConfig
- connector_type: str = CONNECTOR_TYPE

  @contextmanager
  def get_consumer(self) -> ContextManager["Consumer"]:
@@ -90,7 +102,7 @@ class KafkaIndexer(Indexer):
  num_messages_to_consume = self.index_config.num_messages_to_consume
  with self.get_consumer() as consumer:
  while messages_consumed < num_messages_to_consume and empty_polls < max_empty_polls:
- msg = consumer.poll(timeout=self.connection_config.timeout)
+ msg = consumer.poll(timeout=self.index_config.timeout)
  if msg is None:
  logger.debug("No Kafka messages found")
  empty_polls += 1
@@ -139,16 +151,22 @@ class KafkaIndexer(Indexer):
  for message in self.generate_messages():
  yield self.generate_file_data(message)

- async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+ async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
  raise NotImplementedError()

  def precheck(self):
  try:
  with self.get_consumer() as consumer:
- cluster_meta = consumer.list_topics(timeout=self.connection_config.timeout)
+ cluster_meta = consumer.list_topics(timeout=self.index_config.timeout)
  current_topics = [
  topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
  ]
+ if self.index_config.topic not in current_topics:
+ raise SourceConnectionError(
+ "expected topic {} not detected in cluster: {}".format(
+ self.index_config.topic, ", ".join(current_topics)
+ )
+ )
  logger.info(f"successfully checked available topics: {current_topics}")
  except Exception as e:
  logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -160,14 +178,13 @@ class KafkaDownloaderConfig(DownloaderConfig):


  @dataclass
- class KafkaDownloader(Downloader):
+ class KafkaDownloader(Downloader, ABC):
  connection_config: KafkaConnectionConfig
  download_config: KafkaDownloaderConfig = field(default_factory=KafkaDownloaderConfig)
- connector_type: str = CONNECTOR_TYPE
  version: Optional[str] = None
  source_url: Optional[str] = None

- def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
  source_identifiers = file_data.source_identifiers
  if source_identifiers is None:
  raise ValueError("FileData is missing source_identifiers")
@@ -187,10 +204,54 @@ class KafkaDownloader(Downloader):
  return self.generate_download_response(file_data=file_data, download_path=download_path)


- kafka_source_entry = SourceRegistryEntry(
- connection_config=KafkaConnectionConfig,
- indexer=KafkaIndexer,
- indexer_config=KafkaIndexerConfig,
- downloader=KafkaDownloader,
- downloader_config=KafkaDownloaderConfig,
- )
+ class KafkaUploaderConfig(UploaderConfig):
+ batch_size: int = Field(default=100, description="Batch size")
+ topic: str = Field(description="which topic to write to")
+ timeout: Optional[float] = Field(
+ default=10.0, description="Timeout in seconds to flush batch of messages"
+ )
+
+
+ @dataclass
+ class KafkaUploader(Uploader, ABC):
+ connection_config: KafkaConnectionConfig
+ upload_config: KafkaUploaderConfig
+
+ def precheck(self):
+ try:
+ with self.connection_config.get_consumer() as consumer:
+ cluster_meta = consumer.list_topics(timeout=self.upload_config.timeout)
+ current_topics = [
+ topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+ ]
+ logger.info(f"successfully checked available topics: {current_topics}")
+ except Exception as e:
+ logger.error(f"failed to validate connection: {e}", exc_info=True)
+ raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+ def produce_batch(self, elements: list[dict]) -> None:
+ from confluent_kafka.error import KafkaException
+
+ producer = self.connection_config.get_producer()
+ failed_producer = False
+
+ def acked(err, msg):
+ if err is not None:
+ logger.error("Failed to deliver message: %s: %s" % (str(msg), str(err)))
+
+ for element in elements:
+ producer.produce(
+ topic=self.upload_config.topic,
+ value=json.dumps(element),
+ callback=acked,
+ )
+
+ producer.flush(timeout=self.upload_config.timeout)
+ if failed_producer:
+ raise KafkaException("failed to produce all messages in batch")
+
+ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+ with path.open("r") as elements_file:
+ elements = json.load(elements_file)
+ for element_batch in batch_generator(elements, batch_size=self.upload_config.batch_size):
+ self.produce_batch(elements=element_batch)
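
To make the new upload path concrete, the sketch below reproduces the produce/flush cycle that KafkaUploader.produce_batch() performs: read a staged elements file and send each element as a JSON message with a delivery callback. The file path, topic, and broker address are illustrative; in a real pipeline this is driven by KafkaUploader.run() over batches from batch_generator().

    import json

    from confluent_kafka import Producer

    producer = Producer({"bootstrap.servers": "localhost:29092"})  # placeholder broker


    def acked(err, msg):
        # Delivery callback, analogous to the connector's logging of failed deliveries.
        if err is not None:
            print(f"failed to deliver message: {err}")


    with open("structured-output/example.json") as f:  # placeholder staged output file
        elements = json.load(f)

    for element in elements:
        producer.produce(topic="ingest-output", value=json.dumps(element), callback=acked)

    # flush() blocks until outstanding deliveries complete or the timeout elapses,
    # matching the flush(timeout=...) call in produce_batch().
    producer.flush(timeout=10.0)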