unstructured-ingest 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.
Files changed (24)
  1. test/integration/connectors/test_lancedb.py +9 -8
  2. test/integration/connectors/test_milvus.py +34 -6
  3. test/integration/connectors/test_mongodb.py +332 -0
  4. test/integration/connectors/weaviate/test_cloud.py +34 -0
  5. test/unit/test_utils.py +21 -1
  6. unstructured_ingest/__version__.py +1 -1
  7. unstructured_ingest/utils/string_and_date_utils.py +10 -0
  8. unstructured_ingest/v2/processes/connectors/astradb.py +16 -0
  9. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
  10. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -4
  11. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +7 -7
  12. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  13. unstructured_ingest/v2/processes/connectors/milvus.py +9 -3
  14. unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
  15. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +3 -0
  16. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +4 -3
  17. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +10 -0
  18. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/METADATA +14 -12
  19. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/RECORD +24 -21
  20. /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
  21. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/LICENSE.md +0 -0
  22. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/WHEEL +0 -0
  23. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/entry_points.txt +0 -0
  24. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/top_level.txt +0 -0

test/integration/connectors/test_lancedb.py CHANGED
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typing import Literal, Union
+from uuid import uuid4
 
 import lancedb
 import pandas as pd
@@ -13,9 +14,9 @@ from upath import UPath
 from test.integration.connectors.utils.constants import DESTINATION_TAG
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
-    LanceDBS3AccessConfig,
-    LanceDBS3ConnectionConfig,
-    LanceDBS3Uploader,
+    LanceDBAwsAccessConfig,
+    LanceDBAwsConnectionConfig,
+    LanceDBAwsUploader,
 )
 from unstructured_ingest.v2.processes.connectors.lancedb.azure import (
     LanceDBAzureAccessConfig,
@@ -150,12 +151,12 @@ def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path)
     elif target == "az":
         base_uri = UPath(AZURE_BUCKET)
 
-    return str(base_uri / "destination" / "lancedb" / DATABASE_NAME)
+    return str(base_uri / "destination" / "lancedb" / str(uuid4()) / DATABASE_NAME)
 
 
 def _get_uploader(
     uri: str,
-) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBS3Uploader, LanceDBGSPUploader]:
+) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
     target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
     if target == "az":
         azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
@@ -169,10 +170,10 @@ def _get_uploader(
         )
 
     elif target == "s3":
-        return LanceDBS3Uploader(
+        return LanceDBAwsUploader(
             upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
-            connection_config=LanceDBS3ConnectionConfig(
-                access_config=LanceDBS3AccessConfig(
+            connection_config=LanceDBAwsConnectionConfig(
+                access_config=LanceDBAwsAccessConfig(
                     aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
                     aws_secret_access_key=os.getenv("S3_INGEST_TEST_SECRET_KEY"),
                 ),

test/integration/connectors/test_milvus.py CHANGED
@@ -15,6 +15,7 @@ from pymilvus.milvus_client import IndexParams
 from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
 from test.integration.connectors.utils.docker import healthcheck_wait
 from test.integration.connectors.utils.docker_compose import docker_compose_context
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.milvus import (
     CONNECTOR_TYPE,
@@ -24,9 +25,10 @@ from unstructured_ingest.v2.processes.connectors.milvus import (
     MilvusUploadStager,
 )
 
-DB_URI = "http://localhost:19530"
 DB_NAME = "test_database"
-COLLECTION_NAME = "test_collection"
+EXISTENT_COLLECTION_NAME = "test_collection"
+NONEXISTENT_COLLECTION_NAME = "nonexistent_collection"
+DB_URI = "http://localhost:19530"
 
 
 def get_schema() -> CollectionSchema:
@@ -55,7 +57,9 @@ def get_index_params() -> IndexParams:
     return index_params
 
 
-@pytest.fixture
+# NOTE: Precheck tests are read-only so they don't interfere with destination test,
+# using scope="module" we can limit number of times the docker-compose has to be run
+@pytest.fixture(scope="module")
 def collection():
     docker_client = docker.from_env()
     with docker_compose_context(docker_compose_path=env_setup_path / "milvus"):
@@ -73,10 +77,10 @@ def collection():
             schema = get_schema()
             index_params = get_index_params()
             collection_resp = milvus_client.create_collection(
-                collection_name=COLLECTION_NAME, schema=schema, index_params=index_params
+                collection_name=EXISTENT_COLLECTION_NAME, schema=schema, index_params=index_params
             )
-            print(f"Created collection {COLLECTION_NAME}: {collection_resp}")
-            yield COLLECTION_NAME
+            print(f"Created collection {EXISTENT_COLLECTION_NAME}: {collection_resp}")
+            yield EXISTENT_COLLECTION_NAME
         finally:
             milvus_client.close()
 
@@ -139,3 +143,27 @@ async def test_milvus_destination(
     uploader.run(path=staged_filepath, file_data=file_data)
     with uploader.get_client() as client:
         validate_count(client=client, expected_count=expected_count)
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_precheck_succeeds(collection: str):
+    uploader = MilvusUploader(
+        connection_config=MilvusConnectionConfig(uri=DB_URI),
+        upload_config=MilvusUploaderConfig(db_name=DB_NAME, collection_name=collection),
+    )
+    uploader.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_precheck_fails_on_nonexistent_collection(collection: str):
+    uploader = MilvusUploader(
+        connection_config=MilvusConnectionConfig(uri=DB_URI),
+        upload_config=MilvusUploaderConfig(
+            db_name=DB_NAME, collection_name=NONEXISTENT_COLLECTION_NAME
+        ),
+    )
+    with pytest.raises(
+        DestinationConnectionError,
+        match=f"Collection '{NONEXISTENT_COLLECTION_NAME}' does not exist",
+    ):
+        uploader.precheck()

test/integration/connectors/test_mongodb.py ADDED
@@ -0,0 +1,332 @@
+import json
+import os
+import time
+import uuid
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator
+
+import pytest
+from pydantic import BaseModel, SecretStr
+from pymongo.collection import Collection
+from pymongo.database import Database
+from pymongo.mongo_client import MongoClient
+from pymongo.operations import SearchIndexModel
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.mongodb import (
+    CONNECTOR_TYPE,
+    MongoDBAccessConfig,
+    MongoDBConnectionConfig,
+    MongoDBDownloader,
+    MongoDBDownloaderConfig,
+    MongoDBIndexer,
+    MongoDBIndexerConfig,
+    MongoDBUploader,
+    MongoDBUploaderConfig,
+)
+
+SOURCE_COLLECTION = "sample-mongodb-data"
+
+
+class EnvData(BaseModel):
+    uri: SecretStr
+    database: str
+
+
+def get_env_data() -> EnvData:
+    uri = os.getenv("MONGODB_URI")
+    assert uri
+    database = os.getenv("MONGODB_DATABASE")
+    assert database
+    return EnvData(uri=uri, database=database)
+
+
+@contextmanager
+def get_client() -> Generator[MongoClient, None, None]:
+    uri = get_env_data().uri.get_secret_value()
+    with MongoClient(uri) as client:
+        assert client.admin.command("ping")
+        yield client
+
+
+def wait_for_collection(
+    database: Database, collection_name: str, retries: int = 10, interval: int = 1
+):
+    collections = database.list_collection_names()
+    attempts = 0
+    while collection_name not in collections and attempts < retries:
+        attempts += 1
+        print(
+            "Waiting for collection {} to be recognized: {}".format(
+                collection_name, ", ".join(collections)
+            )
+        )
+        time.sleep(interval)
+        collections = database.list_collection_names()
+    if collection_name not in collections:
+        raise TimeoutError(f"Collection {collection_name} was not recognized")
+
+
+def get_search_index_status(collection: Collection, index_name: str) -> str:
+    search_indexes = collection.list_search_indexes(name=index_name)
+    search_index = list(search_indexes)[0]
+    return search_index["status"]
+
+
+def wait_for_search_index(
+    collection: Collection, index_name: str, retries: int = 60, interval: int = 1
+):
+    current_status = get_search_index_status(collection, index_name)
+    attempts = 0
+    while current_status != "READY" and attempts < retries:
+        attempts += 1
+        print(f"attempt {attempts}: waiting for search index to be READY: {current_status}")
+        time.sleep(interval)
+        current_status = get_search_index_status(collection, index_name)
+
+    if current_status != "READY":
+        raise TimeoutError("search index never detected as READY")
+
+
+@pytest.fixture
+def destination_collection() -> Collection:
+    env_data = get_env_data()
+    collection_name = f"utic-test-output-{uuid.uuid4()}"
+    with get_client() as client:
+        database = client[env_data.database]
+        print(f"creating collection in database {database}: {collection_name}")
+        collection = database.create_collection(name=collection_name)
+        search_index_name = "embeddings"
+        collection.create_search_index(
+            model=SearchIndexModel(
+                name=search_index_name,
+                definition={
+                    "mappings": {
+                        "dynamic": True,
+                        "fields": {
+                            "embeddings": [
+                                {"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
+                            ]
+                        },
+                    }
+                },
+            )
+        )
+        collection.create_index("record_id")
+        wait_for_collection(database=database, collection_name=collection_name)
+        wait_for_search_index(collection=collection, index_name=search_index_name)
+        try:
+            yield collection
+        finally:
+            print(f"deleting collection: {collection_name}")
+            collection.drop()
+
+
+def validate_collection_count(
+    collection: Collection, expected_records: int, retries: int = 10, interval: int = 1
+) -> None:
+    count = collection.count_documents(filter={})
+    attempt = 0
+    while count != expected_records and attempt < retries:
+        attempt += 1
+        print(f"attempt {attempt} to get count of collection {count} to match {expected_records}")
+        time.sleep(interval)
+        count = collection.count_documents(filter={})
+    assert (
+        count == expected_records
+    ), f"expected count ({expected_records}) does not match how many records were found: {count}"
+
+
+def validate_collection_vector(
+    collection: Collection, embedding: list[float], text: str, retries: int = 30, interval: int = 1
+) -> None:
+    pipeline = [
+        {
+            "$vectorSearch": {
+                "index": "embeddings",
+                "path": "embeddings",
+                "queryVector": embedding,
+                "numCandidates": 150,
+                "limit": 10,
+            },
+        },
+        {"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
+    ]
+    attempts = 0
+    results = list(collection.aggregate(pipeline=pipeline))
+    while not results and attempts < retries:
+        attempts += 1
+        print(f"attempt {attempts}, waiting for valid results: {results}")
+        time.sleep(interval)
+        results = list(collection.aggregate(pipeline=pipeline))
+    if not results:
+        raise TimeoutError("Timed out waiting for valid results")
+    print(f"found results on attempt {attempts}")
+    top_result = results[0]
+    assert top_result["score"] == 1.0, "score detected should be 1: {}".format(top_result["score"])
+    assert top_result["text"] == text, "text detected should be {}, found: {}".format(
+        text, top_result["text"]
+    )
+    for r in results[1:]:
+        assert r["score"] < 1.0, "score detected should be less than 1: {}".format(r["score"])
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+async def test_mongodb_source(temp_dir: Path):
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(database=env_data.database, collection=SOURCE_COLLECTION)
+    download_config = MongoDBDownloaderConfig(download_dir=temp_dir)
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    downloader = MongoDBDownloader(
+        connection_config=connection_config, download_config=download_config
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
+        ),
+    )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+def test_mongodb_indexer_precheck_fail_no_host():
+    indexer_config = MongoDBIndexerConfig(
+        database="non-existent-database", collection="non-existent-database"
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_indexer_precheck_fail_no_database():
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(
+        database="non-existent-database", collection=SOURCE_COLLECTION
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_indexer_precheck_fail_no_collection():
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(
+        database=env_data.database, collection="non-existent-collection"
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+async def test_mongodb_destination(
+    upload_file: Path,
+    destination_collection: Collection,
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mongodb_mock_id",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+
+    upload_config = MongoDBUploaderConfig(
+        database=env_data.database,
+        collection=destination_collection.name,
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+    uploader.run(path=upload_file, file_data=file_data)
+
+    with upload_file.open() as f:
+        staged_elements = json.load(f)
+    expected_records = len(staged_elements)
+    validate_collection_count(collection=destination_collection, expected_records=expected_records)
+    first_element = staged_elements[0]
+    validate_collection_vector(
+        collection=destination_collection,
+        embedding=first_element["embeddings"],
+        text=first_element["text"],
+    )
+
+    uploader.run(path=upload_file, file_data=file_data)
+    validate_collection_count(collection=destination_collection, expected_records=expected_records)
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_mongodb_uploader_precheck_fail_no_host():
+    upload_config = MongoDBUploaderConfig(
+        database="database",
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_uploader_precheck_fail_no_database():
+    env_data = get_env_data()
+    upload_config = MongoDBUploaderConfig(
+        database="database",
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_uploader_precheck_fail_no_collection():
+    env_data = get_env_data()
+    upload_config = MongoDBUploaderConfig(
+        database=env_data.database,
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()

test/integration/connectors/weaviate/test_cloud.py ADDED
@@ -0,0 +1,34 @@
+import pytest
+from pydantic import ValidationError
+
+from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
+    CloudWeaviateAccessConfig,
+    CloudWeaviateConnectionConfig,
+)
+
+
+def test_weaviate_failing_connection_config():
+    with pytest.raises(ValidationError):
+        CloudWeaviateConnectionConfig(
+            access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
+            username="username",
+            cluster_url="clusterurl",
+        )
+
+
+def test_weaviate_connection_config_happy_path():
+    CloudWeaviateConnectionConfig(
+        access_config=CloudWeaviateAccessConfig(
+            api_key="my key",
+        ),
+        cluster_url="clusterurl",
+    )
+
+
+def test_weaviate_connection_config_anonymous():
+    CloudWeaviateConnectionConfig(
+        access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
+        username="username",
+        anonymous=True,
+        cluster_url="clusterurl",
+    )

test/unit/test_utils.py CHANGED
@@ -8,7 +8,11 @@ import pytz
 
 from unstructured_ingest.cli.utils import extract_config
 from unstructured_ingest.interfaces import BaseConfig
-from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
+from unstructured_ingest.utils.string_and_date_utils import (
+    ensure_isoformat_datetime,
+    json_to_dict,
+    truncate_string_bytes,
+)
 
 
 @dataclass
@@ -162,3 +166,19 @@ def test_ensure_isoformat_datetime_fails_on_string():
 def test_ensure_isoformat_datetime_fails_on_int():
     with pytest.raises(TypeError):
         ensure_isoformat_datetime(1111)
+
+
+def test_truncate_string_bytes_return_truncated_string():
+    test_string = "abcdef안녕하세요ghijklmn방갑습니opqrstu 더 길어지면 안되는 문자열vwxyz"
+    max_bytes = 11
+    result = truncate_string_bytes(test_string, max_bytes)
+    assert result == "abcdef안"
+    assert len(result.encode("utf-8")) <= max_bytes
+
+
+def test_truncate_string_bytes_return_untouched_string():
+    test_string = "abcdef"
+    max_bytes = 11
+    result = truncate_string_bytes(test_string, max_bytes)
+    assert result == "abcdef"
+    assert len(result.encode("utf-8")) <= max_bytes

unstructured_ingest/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.3.1" # pragma: no cover
+__version__ = "0.3.3" # pragma: no cover

unstructured_ingest/utils/string_and_date_utils.py CHANGED
@@ -37,3 +37,13 @@ def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str:
         raise ValueError(f"String '{timestamp}' could not be parsed as a datetime.") from e
     else:
         raise TypeError(f"Expected input type datetime or str, but got {type(timestamp)}.")
+
+
+def truncate_string_bytes(string: str, max_bytes: int, encoding: str = "utf-8") -> str:
+    """
+    Truncates a string to a specified maximum number of bytes.
+    """
+    encoded_string = str(string).encode(encoding)
+    if len(encoded_string) <= max_bytes:
+        return string
+    return encoded_string[:max_bytes].decode(encoding, errors="ignore")
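
To make the byte-versus-character distinction concrete, here is a standalone sketch of the new helper (logic copied from the hunk above, not imported), showing that `errors="ignore"` prevents a multi-byte character from being split in half:

```python
# Standalone copy of truncate_string_bytes for illustration.
def truncate_string_bytes(string: str, max_bytes: int, encoding: str = "utf-8") -> str:
    encoded_string = str(string).encode(encoding)
    if len(encoded_string) <= max_bytes:
        return string
    # Cutting mid-character leaves an invalid trailing byte sequence;
    # errors="ignore" drops it instead of raising UnicodeDecodeError.
    return encoded_string[:max_bytes].decode(encoding, errors="ignore")

# "abcdef" is 6 bytes and each Hangul syllable is 3 bytes in UTF-8, so an
# 11-byte budget fits "abcdef" plus "안" (9 bytes); the partial "녕" is dropped.
print(truncate_string_bytes("abcdef안녕하세요", 11))  # -> abcdef안
```

This matches the behavior pinned down by `test_truncate_string_bytes_return_truncated_string` above.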

unstructured_ingest/v2/processes/connectors/astradb.py CHANGED
@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -50,6 +51,8 @@ if TYPE_CHECKING:
 
 CONNECTOR_TYPE = "astradb"
 
+MAX_CONTENT_PARAM_BYTE_SIZE = 8000
+
 
 class AstraDBAccessConfig(AccessConfig):
     token: str = Field(description="Astra DB Token with access to the database.")
@@ -301,7 +304,20 @@ class AstraDBUploadStager(UploadStager):
         default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
+    def truncate_dict_elements(self, element_dict: dict) -> None:
+        text = element_dict.pop("text", None)
+        if text is not None:
+            element_dict["text"] = truncate_string_bytes(text, MAX_CONTENT_PARAM_BYTE_SIZE)
+        metadata = element_dict.get("metadata")
+        if metadata is not None and isinstance(metadata, dict):
+            text_as_html = element_dict["metadata"].pop("text_as_html", None)
+            if text_as_html is not None:
+                element_dict["metadata"]["text_as_html"] = truncate_string_bytes(
+                    text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
+                )
+
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        self.truncate_dict_elements(element_dict)
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),

unstructured_ingest/v2/processes/connectors/google_drive.py CHANGED
@@ -161,7 +161,7 @@ class GoogleDriveIndexer(Indexer):
             and isinstance(parent_root_path, str)
         ):
             fullpath = f"{parent_path}/{filename}"
-            rel_path = fullpath.replace(parent_root_path, "")
+            rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
             source_identifiers = SourceIdentifiers(
                 filename=filename, fullpath=fullpath, rel_path=rel_path
             )
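
The `str.replace` call this hunk removes strips every occurrence of the root prefix, not just the leading one, and leaves a stray leading separator behind. An illustrative example (the paths are made up):

```python
from pathlib import Path

fullpath = "drive-root/reports/drive-root/reports-2024.pdf"
parent_root_path = "drive-root/reports"

# Old behavior: both matches of the prefix are removed, mangling the path.
print(fullpath.replace(parent_root_path, ""))
# -> /-2024.pdf

# New behavior: only the leading root is stripped, and no separator is left over.
print(Path(fullpath).relative_to(parent_root_path).as_posix())
# -> drive-root/reports-2024.pdf
```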

unstructured_ingest/v2/processes/connectors/lancedb/__init__.py CHANGED
@@ -6,12 +6,25 @@ from .aws import CONNECTOR_TYPE as LANCEDB_S3_CONNECTOR_TYPE
 from .aws import lancedb_aws_destination_entry
 from .azure import CONNECTOR_TYPE as LANCEDB_AZURE_CONNECTOR_TYPE
 from .azure import lancedb_azure_destination_entry
+from .cloud import CONNECTOR_TYPE as LANCEDB_CLOUD_CONNECTOR_TYPE
+from .cloud import lancedb_cloud_destination_entry
 from .gcp import CONNECTOR_TYPE as LANCEDB_GCS_CONNECTOR_TYPE
 from .gcp import lancedb_gcp_destination_entry
 from .local import CONNECTOR_TYPE as LANCEDB_LOCAL_CONNECTOR_TYPE
 from .local import lancedb_local_destination_entry
 
-add_destination_entry(LANCEDB_S3_CONNECTOR_TYPE, lancedb_aws_destination_entry)
-add_destination_entry(LANCEDB_AZURE_CONNECTOR_TYPE, lancedb_azure_destination_entry)
-add_destination_entry(LANCEDB_GCS_CONNECTOR_TYPE, lancedb_gcp_destination_entry)
-add_destination_entry(LANCEDB_LOCAL_CONNECTOR_TYPE, lancedb_local_destination_entry)
+add_destination_entry(
+    destination_type=LANCEDB_S3_CONNECTOR_TYPE, entry=lancedb_aws_destination_entry
+)
+add_destination_entry(
+    destination_type=LANCEDB_AZURE_CONNECTOR_TYPE, entry=lancedb_azure_destination_entry
+)
+add_destination_entry(
+    destination_type=LANCEDB_GCS_CONNECTOR_TYPE, entry=lancedb_gcp_destination_entry
+)
+add_destination_entry(
+    destination_type=LANCEDB_LOCAL_CONNECTOR_TYPE, entry=lancedb_local_destination_entry
+)
+add_destination_entry(
+    destination_type=LANCEDB_CLOUD_CONNECTOR_TYPE, entry=lancedb_cloud_destination_entry
+)

unstructured_ingest/v2/processes/connectors/lancedb/aws.py CHANGED
@@ -15,28 +15,28 @@ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
 CONNECTOR_TYPE = "lancedb_aws"
 
 
-class LanceDBS3AccessConfig(AccessConfig):
+class LanceDBAwsAccessConfig(AccessConfig):
     aws_access_key_id: str = Field(description="The AWS access key ID to use.")
     aws_secret_access_key: str = Field(description="The AWS secret access key to use.")
 
 
-class LanceDBS3ConnectionConfig(LanceDBRemoteConnectionConfig):
-    access_config: Secret[LanceDBS3AccessConfig]
+class LanceDBAwsConnectionConfig(LanceDBRemoteConnectionConfig):
+    access_config: Secret[LanceDBAwsAccessConfig]
 
     def get_storage_options(self) -> dict:
         return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
 
 
 @dataclass
-class LanceDBS3Uploader(LanceDBUploader):
+class LanceDBAwsUploader(LanceDBUploader):
     upload_config: LanceDBUploaderConfig
-    connection_config: LanceDBS3ConnectionConfig
+    connection_config: LanceDBAwsConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
 
 lancedb_aws_destination_entry = DestinationRegistryEntry(
-    connection_config=LanceDBS3ConnectionConfig,
-    uploader=LanceDBS3Uploader,
+    connection_config=LanceDBAwsConnectionConfig,
+    uploader=LanceDBAwsUploader,
     uploader_config=LanceDBUploaderConfig,
     upload_stager_config=LanceDBUploadStagerConfig,
     upload_stager=LanceDBUploadStager,
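
Note that the `LanceDBS3*` to `LanceDBAws*` rename is a breaking change with no alias left behind, so downstream imports must be updated:

```python
# Before (0.3.1):
from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
    LanceDBS3AccessConfig,      # removed in 0.3.3
    LanceDBS3ConnectionConfig,  # removed in 0.3.3
    LanceDBS3Uploader,          # removed in 0.3.3
)

# After (0.3.3):
from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
    LanceDBAwsAccessConfig,
    LanceDBAwsConnectionConfig,
    LanceDBAwsUploader,
)
```

The registered connector type string itself, `lancedb_aws` (`CONNECTOR_TYPE` in the hunk above), is unchanged.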