unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of unstructured-ingest might be problematic.

Files changed (55)
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +50 -3
  10. test/integration/connectors/test_delta_table.py +46 -0
  11. test/integration/connectors/test_kafka.py +40 -6
  12. test/integration/connectors/test_lancedb.py +210 -0
  13. test/integration/connectors/test_milvus.py +141 -0
  14. test/integration/connectors/test_mongodb.py +332 -0
  15. test/integration/connectors/test_pinecone.py +53 -1
  16. test/integration/connectors/utils/docker.py +81 -15
  17. test/integration/connectors/utils/validation.py +10 -0
  18. test/integration/connectors/weaviate/__init__.py +0 -0
  19. test/integration/connectors/weaviate/conftest.py +15 -0
  20. test/integration/connectors/weaviate/test_local.py +131 -0
  21. unstructured_ingest/__version__.py +1 -1
  22. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  23. unstructured_ingest/utils/data_prep.py +9 -1
  24. unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
  25. unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
  26. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
  27. unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
  28. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  29. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
  30. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  31. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
  32. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
  33. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  34. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  35. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  36. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  37. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  38. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  39. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  40. unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
  41. unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
  42. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  43. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +25 -0
  44. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  45. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  46. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  47. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +299 -0
  48. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA +19 -19
  49. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD +54 -33
  50. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  51. /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
  52. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/LICENSE.md +0 -0
  53. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/WHEEL +0 -0
  54. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt +0 -0
  55. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,131 @@
+ import json
+ import time
+ from pathlib import Path
+
+ import pytest
+ import requests
+ import weaviate
+ from weaviate.client import WeaviateClient
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
+ from test.integration.connectors.utils.docker import container_context
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.weaviate.local import (
+     CONNECTOR_TYPE,
+     LocalWeaviateConnectionConfig,
+     LocalWeaviateUploader,
+     LocalWeaviateUploaderConfig,
+     LocalWeaviateUploadStager,
+ )
+
+ COLLECTION_NAME = "elements"
+
+
+ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
+     start_time = time.time()
+     while time.time() - start_time < timeout:
+         try:
+             requests.get("http://localhost:8080/v1/.well-known/read")
+             return
+         except Exception as e:
+             print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
+             time.sleep(interval)
+     raise TimeoutError("Docker container never came up healthy")
+
+
+ @pytest.fixture
+ def collection(collections_schema_config: dict) -> str:
+     with container_context(
+         image="semitechnologies/weaviate:1.27.3",
+         ports={8080: 8080, 50051: 50051},
+     ):
+         wait_for_container()
+         with weaviate.connect_to_local() as weaviate_client:
+             weaviate_client.collections.create_from_dict(config=collections_schema_config)
+         yield COLLECTION_NAME
+
+
+ def get_count(client: WeaviateClient) -> int:
+     collection = client.collections.get(COLLECTION_NAME)
+     resp = collection.aggregate.over_all(total_count=True)
+     return resp.total_count
+
+
+ def validate_count(expected_count: int, retries: int = 10, interval: int = 1) -> None:
+     with weaviate.connect_to_local() as weaviate_client:
+         current_count = get_count(client=weaviate_client)
+         retry_count = 0
+         while current_count != expected_count and retry_count < retries:
+             retry_count += 1
+             time.sleep(interval)
+             current_count = get_count(client=weaviate_client)
+         assert current_count == expected_count, (
+             f"Expected count ({expected_count}) doesn't match how "
+             f"much came back from collection: {current_count}"
+         )
+
+
+ def run_uploader_and_validate(
+     uploader: LocalWeaviateUploader, path: Path, file_data: FileData, expected_count: int
+ ):
+     uploader.precheck()
+     uploader.run(path=path, file_data=file_data)
+     validate_count(expected_count=expected_count)
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     stager = LocalWeaviateUploadStager()
+
+     staged_filepath = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+     dynamic_uploader = LocalWeaviateUploader(
+         upload_config=LocalWeaviateUploaderConfig(
+             collection=COLLECTION_NAME,
+         ),
+         connection_config=LocalWeaviateConnectionConfig(),
+     )
+     fixed_size_uploader = LocalWeaviateUploader(
+         upload_config=LocalWeaviateUploaderConfig(
+             collection=COLLECTION_NAME, batch_size=10, dynamic_batch=False
+         ),
+         connection_config=LocalWeaviateConnectionConfig(),
+     )
+     rate_limited_uploader = LocalWeaviateUploader(
+         upload_config=LocalWeaviateUploaderConfig(
+             collection=COLLECTION_NAME, requests_per_minute=50, dynamic_batch=False
+         ),
+         connection_config=LocalWeaviateConnectionConfig(),
+     )
+     with staged_filepath.open() as f:
+         staged_elements = json.load(f)
+     expected_count = len(staged_elements)
+
+     run_uploader_and_validate(
+         uploader=dynamic_uploader,
+         path=staged_filepath,
+         file_data=file_data,
+         expected_count=expected_count,
+     )
+     run_uploader_and_validate(
+         uploader=fixed_size_uploader,
+         path=staged_filepath,
+         file_data=file_data,
+         expected_count=expected_count,
+     )
+     run_uploader_and_validate(
+         uploader=rate_limited_uploader,
+         path=staged_filepath,
+         file_data=file_data,
+         expected_count=expected_count,
+     )
@@ -1 +1 @@
- __version__ = "0.3.0" # pragma: no cover
+ __version__ = "0.3.2" # pragma: no cover
@@ -61,4 +61,4 @@ class Embedder(ReformatNode):
          return None
 
      def get_path(self) -> Path:
-         return (Path(self.pipeline_context.work_dir) / "embedded").resolve()
+         return (Path(self.pipeline_context.work_dir) / "embedded.py").resolve()
@@ -1,7 +1,9 @@
  import itertools
  import json
  from datetime import datetime
- from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
+ from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+
+ import pandas as pd
 
  DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
@@ -9,6 +11,12 @@ T = TypeVar("T")
  IterableT = Iterable[T]
 
 
+ def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
+     num_chunks = len(df) // chunk_size + 1
+     for i in range(num_chunks):
+         yield df[i * chunk_size : (i + 1) * chunk_size]
+
+
  def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
      """A helper function to break an iterable into batches of size batch_size."""
      it = iter(iterable)
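
Note: a minimal usage sketch of the new split_dataframe helper in data_prep (the sample dataframe below is illustrative, not part of the package):

import pandas as pd

from unstructured_ingest.utils.data_prep import split_dataframe

# 250 rows come back as slices of at most 100 rows each: 100, 100, 50.
df = pd.DataFrame({"id": range(250), "text": ["chunk"] * 250})
for chunk in split_dataframe(df, chunk_size=100):
    print(len(chunk))
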
@@ -1,10 +1,13 @@
  from __future__ import annotations
 
  import unstructured_ingest.v2.processes.connectors.databricks # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.elasticsearch # noqa: F401
  import unstructured_ingest.v2.processes.connectors.fsspec # noqa: F401
  import unstructured_ingest.v2.processes.connectors.kafka # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.lancedb # noqa: F401
  import unstructured_ingest.v2.processes.connectors.qdrant # noqa: F401
  import unstructured_ingest.v2.processes.connectors.sql # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.weaviate # noqa: F401
  from unstructured_ingest.v2.processes.connector_registry import (
      add_destination_entry,
      add_source_entry,
@@ -24,8 +27,6 @@ from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
  from .couchbase import couchbase_destination_entry, couchbase_source_entry
  from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
  from .delta_table import delta_table_destination_entry
- from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
- from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
  from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
  from .gitlab import gitlab_source_entry
  from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -40,8 +41,6 @@ from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
  from .mongodb import mongodb_destination_entry, mongodb_source_entry
  from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
  from .onedrive import onedrive_destination_entry, onedrive_source_entry
- from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
- from .opensearch import opensearch_destination_entry, opensearch_source_entry
  from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
  from .outlook import outlook_source_entry
  from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
@@ -52,8 +51,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
  from .sharepoint import sharepoint_source_entry
  from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
  from .slack import slack_source_entry
- from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
- from .weaviate import weaviate_destination_entry
 
  add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
  add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -67,10 +64,6 @@ add_destination_entry(
      destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
  )
 
- add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
- add_destination_entry(
-     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
- )
 
  add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)
 
@@ -80,15 +73,9 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
  add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
  add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
- add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
- add_destination_entry(
-     destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
- )
 
  add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
 
- add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
-
  add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
  add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
 
@@ -170,7 +170,7 @@ class AstraDBIndexer(Indexer):
 
      def precheck(self) -> None:
          try:
-             self.get_collection()
+             self.get_collection().options()
          except Exception as e:
              logger.error(f"Failed to validate connection {e}", exc_info=True)
              raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -345,7 +345,7 @@ class AstraDBUploader(Uploader):
                  connection_config=self.connection_config,
                  collection_name=self.upload_config.collection_name,
                  keyspace=self.upload_config.keyspace,
-             )
+             ).options()
          except Exception as e:
              logger.error(f"Failed to validate connection {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -155,6 +155,10 @@ class AzureAISearchUploadStager(UploadStager):
              self.conform_dict(data=element, file_data=file_data) for element in elements_contents
          ]
 
+         if Path(output_filename).suffix != ".json":
+             output_filename = f"{output_filename}.json"
+         else:
+             output_filename = f"{Path(output_filename).stem}.json"
          output_path = Path(output_dir) / Path(f"{output_filename}.json")
          output_path.parent.mkdir(parents=True, exist_ok=True)
          with open(output_path, "w") as output_file:
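
Note: the stager now normalizes the staged output filename so it ends in a single .json suffix. A small sketch of that logic in isolation (normalize_json_filename is an illustrative name, not a function shipped in the package):

from pathlib import Path

def normalize_json_filename(output_filename: str) -> str:
    # Append ".json" when it is missing; otherwise rebuild "<stem>.json".
    if Path(output_filename).suffix != ".json":
        return f"{output_filename}.json"
    return f"{Path(output_filename).stem}.json"

assert normalize_json_filename("report") == "report.json"
assert normalize_json_filename("report.json") == "report.json"
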
@@ -1,7 +1,8 @@
  import json
  import os
+ import traceback
  from dataclasses import dataclass, field
- from multiprocessing import Process
+ from multiprocessing import Process, Queue
  from pathlib import Path
  from typing import Any, Optional
  from urllib.parse import urlparse
@@ -27,6 +28,15 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
  CONNECTOR_TYPE = "delta_table"
 
 
+ def write_deltalake_with_error_handling(queue, **kwargs):
+     from deltalake.writer import write_deltalake
+
+     try:
+         write_deltalake(**kwargs)
+     except Exception:
+         queue.put(traceback.format_exc())
+
+
  class DeltaTableAccessConfig(AccessConfig):
      aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
      aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
@@ -157,7 +167,6 @@ class DeltaTableUploader(Uploader):
 
      @requires_dependencies(["deltalake"], extras="delta-table")
      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         from deltalake.writer import write_deltalake
 
          df = self.read_dataframe(path)
          updated_upload_path = os.path.join(
@@ -176,17 +185,24 @@ class DeltaTableUploader(Uploader):
              "mode": "overwrite",
              "storage_options": storage_options,
          }
+         queue = Queue()
          # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
          # ingest to fail, even though all tasks are completed normally. Putting the writer into a
          # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
          # rust backend to finish
          writer = Process(
-             target=write_deltalake,
-             kwargs=writer_kwargs,
+             target=write_deltalake_with_error_handling,
+             kwargs={"queue": queue, **writer_kwargs},
          )
          writer.start()
          writer.join()
 
+         # Check if the queue has any exception message
+         if not queue.empty():
+             error_message = queue.get()
+             logger.error(f"Exception occurred in write_deltalake: {error_message}")
+             raise RuntimeError(f"Error in write_deltalake: {error_message}")
+
 
  delta_table_destination_entry = DestinationRegistryEntry(
      connection_config=DeltaTableConnectionConfig,
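
Note: exceptions raised inside a multiprocessing.Process do not propagate to the parent, which is why the writer now reports failures through a Queue drained after join(). A self-contained sketch of that pattern (flaky_write is an illustrative stand-in for write_deltalake, not part of the package):

import traceback
from multiprocessing import Process, Queue


def flaky_write(queue: Queue, **kwargs) -> None:
    # Stand-in for the real writer: push the formatted traceback on failure.
    try:
        raise ValueError(f"could not write {kwargs['table_uri']}")
    except Exception:
        queue.put(traceback.format_exc())


if __name__ == "__main__":
    queue = Queue()
    writer = Process(target=flaky_write, kwargs={"queue": queue, "table_uri": "s3://bucket/table"})
    writer.start()
    writer.join()
    # A non-empty queue means the child failed; surface it in the parent.
    if not queue.empty():
        raise RuntimeError(f"Error in child process: {queue.get()}")
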
@@ -0,0 +1,19 @@
+ from unstructured_ingest.v2.processes.connector_registry import (
+     add_destination_entry,
+     add_source_entry,
+ )
+
+ from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
+ from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+ from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
+ from .opensearch import opensearch_destination_entry, opensearch_source_entry
+
+ add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
+ add_destination_entry(
+     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
+ )
+
+ add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
+ add_destination_entry(
+     destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
+ )
@@ -2,6 +2,7 @@ import hashlib
  import json
  import sys
  import uuid
+ from contextlib import contextmanager
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
@@ -13,9 +14,11 @@ from unstructured_ingest.error import (
      DestinationConnectionError,
      SourceConnectionError,
      SourceConnectionNetworkError,
+     WriteError,
  )
  from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
  from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
@@ -26,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
      FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
+     SourceIdentifiers,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -116,19 +120,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
          return client_kwargs
 
      @requires_dependencies(["elasticsearch"], extras="elasticsearch")
-     def get_client(self) -> "ElasticsearchClient":
+     @contextmanager
+     def get_client(self) -> Generator["ElasticsearchClient", None, None]:
          from elasticsearch import Elasticsearch as ElasticsearchClient
 
-         client = ElasticsearchClient(**self.get_client_kwargs())
-         self.check_connection(client=client)
-         return client
-
-     def check_connection(self, client: "ElasticsearchClient"):
-         try:
-             client.perform_request("HEAD", "/", headers={"accept": "application/json"})
-         except Exception as e:
-             logger.error(f"failed to validate connection: {e}", exc_info=True)
-             raise SourceConnectionError(f"failed to validate connection: {e}")
+         with ElasticsearchClient(**self.get_client_kwargs()) as client:
+             yield client
 
 
  class ElasticsearchIndexerConfig(IndexerConfig):
@@ -144,7 +141,16 @@ class ElasticsearchIndexer(Indexer):
 
      def precheck(self) -> None:
          try:
-             self.connection_config.get_client()
+             with self.connection_config.get_client() as client:
+                 if not client.ping():
+                     raise SourceConnectionError("cluster not detected")
+                 indices = client.indices.get_alias(index="*")
+                 if self.index_config.index_name not in indices:
+                     raise SourceConnectionError(
+                         "index {} not found: {}".format(
+                             self.index_config.index_name, ", ".join(indices.keys())
+                         )
+                     )
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -160,15 +166,15 @@ class ElasticsearchIndexer(Indexer):
          scan = self.load_scan()
 
          scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
-         client = self.connection_config.get_client()
-         hits = scan(
-             client,
-             query=scan_query,
-             scroll="1m",
-             index=self.index_config.index_name,
-         )
+         with self.connection_config.get_client() as client:
+             hits = scan(
+                 client,
+                 query=scan_query,
+                 scroll="1m",
+                 index=self.index_config.index_name,
+             )
 
-         return {hit["_id"] for hit in hits}
+             return {hit["_id"] for hit in hits}
 
      def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
          all_ids = self._get_doc_ids()
@@ -257,6 +263,7 @@ class ElasticsearchDownloader(Downloader):
              file_data=FileData(
                  identifier=filename_id,
                  connector_type=CONNECTOR_TYPE,
+                 source_identifiers=SourceIdentifiers(filename=filename, fullpath=filename),
                  metadata=FileDataSourceMetadata(
                      version=str(result["_version"]) if "_version" in result else None,
                      date_processed=str(time()),
@@ -318,7 +325,7 @@ class ElasticsearchUploadStagerConfig(UploadStagerConfig):
  class ElasticsearchUploadStager(UploadStager):
      upload_stager_config: ElasticsearchUploadStagerConfig
 
-     def conform_dict(self, data: dict) -> dict:
+     def conform_dict(self, data: dict, file_data: FileData) -> dict:
          resp = {
              "_index": self.upload_stager_config.index_name,
              "_id": str(uuid.uuid4()),
@@ -327,6 +334,7 @@ class ElasticsearchUploadStager(UploadStager):
                  "embeddings": data.pop("embeddings", None),
                  "text": data.pop("text", None),
                  "type": data.pop("type", None),
+                 RECORD_ID_LABEL: file_data.identifier,
              },
          }
          if "metadata" in data and isinstance(data["metadata"], dict):
@@ -343,10 +351,17 @@ class ElasticsearchUploadStager(UploadStager):
      ) -> Path:
          with open(elements_filepath) as elements_file:
              elements_contents = json.load(elements_file)
-         conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
-         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+         conformed_elements = [
+             self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+         ]
+         if Path(output_filename).suffix != ".json":
+             output_filename = f"{output_filename}.json"
+         else:
+             output_filename = f"{Path(output_filename).stem}.json"
+         output_path = Path(output_dir) / output_filename
+         output_path.parent.mkdir(parents=True, exist_ok=True)
          with open(output_path, "w") as output_file:
-             json.dump(conformed_elements, output_file)
+             json.dump(conformed_elements, output_file, indent=2)
          return output_path
 
 
@@ -363,6 +378,10 @@ class ElasticsearchUploaderConfig(UploaderConfig):
      num_threads: int = Field(
          default=4, description="Number of threads to be used while uploading content"
      )
+     record_id_key: str = Field(
+         default=RECORD_ID_LABEL,
+         description="searchable key to find entries for the same record on previous runs",
+     )
 
 
  @dataclass
@@ -373,7 +392,16 @@ class ElasticsearchUploader(Uploader):
 
      def precheck(self) -> None:
          try:
-             self.connection_config.get_client()
+             with self.connection_config.get_client() as client:
+                 if not client.ping():
+                     raise DestinationConnectionError("cluster not detected")
+                 indices = client.indices.get_alias(index="*")
+                 if self.upload_config.index_name not in indices:
+                     raise SourceConnectionError(
+                         "index {} not found: {}".format(
+                             self.upload_config.index_name, ", ".join(indices.keys())
+                         )
+                     )
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -384,6 +412,23 @@ class ElasticsearchUploader(Uploader):
 
          return parallel_bulk
 
+     def delete_by_record_id(self, client, file_data: FileData) -> None:
+         logger.debug(
+             f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+             f"from {self.upload_config.index_name} index"
+         )
+         delete_resp = client.delete_by_query(
+             index=self.upload_config.index_name,
+             body={"query": {"match": {self.upload_config.record_id_key: file_data.identifier}}},
+         )
+         logger.info(
+             "deleted {} records from index {}".format(
+                 delete_resp["deleted"], self.upload_config.index_name
+             )
+         )
+         if failures := delete_resp.get("failures"):
+             raise WriteError(f"failed to delete records: {failures}")
+
      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
          parallel_bulk = self.load_parallel_bulk()
          with path.open("r") as file:
@@ -397,28 +442,29 @@ class ElasticsearchUploader(Uploader):
              f"{self.upload_config.num_threads} (number of) threads"
          )
 
-         client = self.connection_config.get_client()
-         if not client.indices.exists(index=self.upload_config.index_name):
-             logger.warning(
-                 f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
-                 f"{self.upload_config.index_name}. "
-                 f"This may cause issues when uploading."
-             )
-         for batch in generator_batching_wbytes(
-             elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
-         ):
-             for success, info in parallel_bulk(
-                 client=client,
-                 actions=batch,
-                 thread_count=self.upload_config.num_threads,
+         with self.connection_config.get_client() as client:
+             self.delete_by_record_id(client=client, file_data=file_data)
+             if not client.indices.exists(index=self.upload_config.index_name):
+                 logger.warning(
+                     f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
+                     f"{self.upload_config.index_name}. "
+                     f"This may cause issues when uploading."
+                 )
+             for batch in generator_batching_wbytes(
+                 elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
              ):
-                 if not success:
-                     logger.error(
-                         "upload failed for a batch in "
-                         f"{(self.__class__.__name__).replace('Uploader', '')} "
-                         "destination connector:",
-                         info,
-                     )
+                 for success, info in parallel_bulk(
+                     client=client,
+                     actions=batch,
+                     thread_count=self.upload_config.num_threads,
+                 ):
+                     if not success:
+                         logger.error(
+                             "upload failed for a batch in "
+                             f"{(self.__class__.__name__).replace('Uploader', '')} "
+                             "destination connector:",
+                             info,
+                         )
 
 
  elasticsearch_source_entry = SourceRegistryEntry(
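
Note: with get_client() now a context manager and a record-id label stamped on every staged document, the uploader deletes whatever a previous run wrote for the same record before re-indexing it. A condensed sketch of that replace-by-record strategy against a raw elasticsearch client (the host, index name, document, and the "record_id" label value are assumptions for illustration):

from elasticsearch import Elasticsearch

RECORD_ID_LABEL = "record_id"  # assumed value of the shared constant

with Elasticsearch("http://localhost:9200") as client:
    # Drop documents written for this record by earlier runs...
    client.delete_by_query(
        index="elements",
        body={"query": {"match": {RECORD_ID_LABEL: "mock-file-id"}}},
    )
    # ...then index the freshly staged documents.
    client.index(index="elements", document={"text": "hello", RECORD_ID_LABEL: "mock-file-id"})
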
@@ -17,7 +17,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
      SourceRegistryEntry,
  )
- from unstructured_ingest.v2.processes.connectors.elasticsearch import (
+ from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
      ElasticsearchDownloader,
      ElasticsearchDownloaderConfig,
      ElasticsearchIndexer,
@@ -161,7 +161,7 @@ class GoogleDriveIndexer(Indexer):
              and isinstance(parent_root_path, str)
          ):
              fullpath = f"{parent_path}/{filename}"
-             rel_path = fullpath.replace(parent_root_path, "")
+             rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
              source_identifiers = SourceIdentifiers(
                  filename=filename, fullpath=fullpath, rel_path=rel_path
              )
@@ -161,6 +161,12 @@ class KafkaIndexer(Indexer, ABC):
              current_topics = [
                  topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
              ]
+             if self.index_config.topic not in current_topics:
+                 raise SourceConnectionError(
+                     "expected topic {} not detected in cluster: {}".format(
+                         self.index_config.topic, ", ".join(current_topics)
+                     )
+                 )
              logger.info(f"successfully checked available topics: {current_topics}")
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -0,0 +1,17 @@
+ from __future__ import annotations
+
+ from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+ from .aws import CONNECTOR_TYPE as LANCEDB_S3_CONNECTOR_TYPE
+ from .aws import lancedb_aws_destination_entry
+ from .azure import CONNECTOR_TYPE as LANCEDB_AZURE_CONNECTOR_TYPE
+ from .azure import lancedb_azure_destination_entry
+ from .gcp import CONNECTOR_TYPE as LANCEDB_GCS_CONNECTOR_TYPE
+ from .gcp import lancedb_gcp_destination_entry
+ from .local import CONNECTOR_TYPE as LANCEDB_LOCAL_CONNECTOR_TYPE
+ from .local import lancedb_local_destination_entry
+
+ add_destination_entry(LANCEDB_S3_CONNECTOR_TYPE, lancedb_aws_destination_entry)
+ add_destination_entry(LANCEDB_AZURE_CONNECTOR_TYPE, lancedb_azure_destination_entry)
+ add_destination_entry(LANCEDB_GCS_CONNECTOR_TYPE, lancedb_gcp_destination_entry)
+ add_destination_entry(LANCEDB_LOCAL_CONNECTOR_TYPE, lancedb_local_destination_entry)