unstructured-ingest 0.5.18__py3-none-any.whl → 0.5.20__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

--- test/integration/connectors/test_redis.py
+++ test/integration/connectors/test_redis.py
@@ -23,20 +23,22 @@ from unstructured_ingest.v2.processes.connectors.redisdb import (
 )


-async def delete_record(client: Redis, element_id: str) -> None:
-    await client.delete(element_id)
+async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
+    key_with_prefix = f"{key_prefix}{element_id}"
+    await client.delete(key_with_prefix)


-async def validate_upload(client: Redis, first_element: dict):
+async def validate_upload(client: Redis, first_element: dict, key_prefix: str) -> None:
     element_id = first_element["element_id"]
+    key_with_prefix = f"{key_prefix}{element_id}"
     expected_text = first_element["text"]
     expected_embeddings = first_element["embeddings"]
     async with client.pipeline(transaction=True) as pipe:
         try:
-            response = await pipe.json().get(element_id, "$").execute()
+            response = await pipe.json().get(key_with_prefix, "$").execute()
             response = response[0][0]
         except redis_exceptions.ResponseError:
-            response = await pipe.get(element_id).execute()
+            response = await pipe.get(key_with_prefix).execute()
             response = json.loads(response[0])

         embedding_similarity = np.linalg.norm(
@@ -53,6 +55,7 @@ async def redis_destination_test(
     upload_file: Path,
     tmp_path: Path,
     connection_kwargs: dict,
+    uploader_config: dict,
     uri: Optional[str] = None,
     password: Optional[str] = None,
 ):
@@ -60,8 +63,9 @@ async def redis_destination_test(
         connection_config=RedisConnectionConfig(
             **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
         ),
-        upload_config=RedisUploaderConfig(batch_size=10),
+        upload_config=RedisUploaderConfig(batch_size=10, **uploader_config),
     )
+    key_prefix = uploader.upload_config.key_prefix

     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
@@ -78,20 +82,32 @@ async def redis_destination_test(

         if uri:
             async with from_url(uri) as client:
-                await validate_upload(client=client, first_element=first_element)
+                await validate_upload(
+                    client=client,
+                    first_element=first_element,
+                    key_prefix=key_prefix,
+                )
         else:
             async with Redis(**connection_kwargs, password=password) as client:
-                await validate_upload(client=client, first_element=first_element)
+                await validate_upload(
+                    client=client,
+                    first_element=first_element,
+                    key_prefix=key_prefix,
+                )
     except Exception as e:
         raise e
     finally:
         if uri:
             async with from_url(uri) as client:
-                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                tasks = [
+                    delete_record(client, element["element_id"], key_prefix) for element in elements
+                ]
                 await asyncio.gather(*tasks)
         else:
             async with Redis(**connection_kwargs, password=password) as client:
-                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                tasks = [
+                    delete_record(client, element["element_id"], key_prefix) for element in elements
+                ]
                 await asyncio.gather(*tasks)


@@ -105,8 +121,13 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
         "db": 0,
         "ssl": True,
     }
+    uploader_config = {
+        "key_prefix": "test_ingest:",
+    }
     redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
-    await redis_destination_test(upload_file, tmp_path, connection_kwargs, password=redis_pw)
+    await redis_destination_test(
+        upload_file, tmp_path, connection_kwargs, uploader_config, password=redis_pw
+    )


 @pytest.mark.asyncio
@@ -114,6 +135,9 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
 @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
 async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
     connection_kwargs = {}
+    uploader_config = {
+        "key_prefix": "test_ingest:",
+    }
     redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
     uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
-    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uploader_config, uri=uri)
--- /dev/null
+++ test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py
@@ -0,0 +1,459 @@
+import time
+from unittest.mock import MagicMock
+
+import pandas as pd
+import pytest
+from pydantic import Secret
+from pyiceberg.exceptions import CommitFailedException
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.errors import ProviderError, UserError
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
+    IbmWatsonxAccessConfig,
+    IbmWatsonxConnectionConfig,
+    IbmWatsonxUploader,
+    IbmWatsonxUploaderConfig,
+)
+
+
+@pytest.fixture
+def file_data():
+    return FileData(
+        identifier="test_identifier",
+        connector_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(
+            filename="test_file.pdf", fullpath="/tmp/test_file.pdf"
+        ),
+    )
+
+
+@pytest.fixture
+def access_config():
+    return IbmWatsonxAccessConfig(
+        iam_api_key="test_iam_api_key",
+        access_key_id="test_access_key_id",
+        secret_access_key="test_secret_access_key",
+    )
+
+
+@pytest.fixture
+def connection_config(access_config: IbmWatsonxAccessConfig):
+    return IbmWatsonxConnectionConfig(
+        access_config=Secret(access_config),
+        iceberg_endpoint="test_iceberg_endpoint/",
+        object_storage_endpoint="test_object_storage_endpoint/",
+        object_storage_region="test_region",
+        catalog="test_catalog",
+    )
+
+
+@pytest.fixture
+def uploader_config():
+    return IbmWatsonxUploaderConfig(
+        namespace="test_namespace",
+        table="test_table",
+        record_id_key="test_record_id_key",
+    )
+
+
+@pytest.fixture
+def uploader(
+    connection_config: IbmWatsonxConnectionConfig, uploader_config: IbmWatsonxUploaderConfig
+):
+    return IbmWatsonxUploader(
+        connection_config=connection_config,
+        upload_config=uploader_config,
+    )
+
+
+@pytest.fixture
+def mock_catalog(mocker: MockerFixture):
+    mock_catalog = mocker.MagicMock()
+    mock_catalog.namespace_exists.return_value = True
+    mock_catalog.table_exists.return_value = True
+    return mock_catalog
+
+
+@pytest.fixture
+def mock_get_catalog(mocker: MockerFixture, mock_catalog: MagicMock):
+    mock_get_catalog = mocker.patch.context_manager(
+        IbmWatsonxConnectionConfig, "get_catalog", autospec=True
+    )
+    mock_get_catalog.return_value.__enter__.return_value = mock_catalog
+    return mock_get_catalog
+
+
+@pytest.fixture
+def mock_table(mocker: MockerFixture):
+    mock_table = mocker.MagicMock()
+    return mock_table
+
+
+@pytest.fixture
+def mock_get_table(mocker: MockerFixture, mock_table: MagicMock):
+    mock_get_table = mocker.patch.context_manager(IbmWatsonxUploader, "get_table", autospec=True)
+    mock_get_table.return_value.__enter__.return_value = mock_table
+    return mock_get_table
+
+
+@pytest.fixture
+def mock_transaction(mocker: MockerFixture, mock_table: MagicMock):
+    mock_transaction = mocker.MagicMock()
+    mock_table.transaction.return_value.__enter__.return_value = mock_transaction
+    return mock_transaction
+
+
+@pytest.fixture
+def mock_data_table(mocker: MockerFixture):
+    mock_data_table = mocker.MagicMock()
+    mock_data_table.schema = "schema"
+    return mock_data_table
+
+
+@pytest.fixture
+def mock_delete(mocker: MockerFixture):
+    return mocker.patch.object(IbmWatsonxUploader, "_delete")
+
+
+@pytest.fixture
+def test_df():
+    return pd.DataFrame(
+        {
+            "test_column_0": [True, False, True],
+            "test_column_1": [1, 2, 3],
+            "test_column_2": ["a", "b", "c"],
+        }
+    )
+
+
+@pytest.fixture
+def timestamp_now():
+    return int(time.time())
+
+
+def test_ibm_watsonx_connection_config_iceberg_url(
+    mocker: MockerFixture,
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    expected_url = "https://test_iceberg_endpoint/mds/iceberg"
+    assert connection_config.iceberg_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_object_storage_url(
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    expected_url = "https://test_object_storage_endpoint"
+    assert connection_config.object_storage_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_bearer_token_new_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_existing_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "existing_token",
+        "expiration": timestamp_now + 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig, "generate_bearer_token"
+    )
+    token = connection_config.bearer_token
+    assert token == "existing_token"
+    mock_generate_bearer_token.assert_not_called()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_expired_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "expired_token",
+        "expiration": timestamp_now - 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_soon_to_expire_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "soon_to_expire_token",
+        "expiration": timestamp_now + 60,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_get_catalog_success(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    mock_load_catalog = mocker.patch("pyiceberg.catalog.load_catalog")
+
+    with connection_config.get_catalog() as catalog:
+        assert catalog is not None
+        mock_load_catalog.assert_called_once_with(
+            **{
+                "name": "test_catalog",
+                "type": "rest",
+                "uri": "https://test_iceberg_endpoint/mds/iceberg",
+                "token": "test_bearer_token",
+                "warehouse": "test_catalog",
+                "s3.endpoint": "https://test_object_storage_endpoint",
+                "s3.access-key-id": "test_access_key_id",
+                "s3.secret-access-key": "test_secret_access_key",
+                "s3.region": "test_region",
+            }
+        )
+
+
+def test_ibm_watsonx_connection_config_get_catalog_failure(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "pyiceberg.catalog.load_catalog",
+        side_effect=Exception("Connection error"),
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    with pytest.raises(ProviderError):
+        with connection_config.get_catalog():
+            pass
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_exists_table_exists(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.namespace_exists.return_value = False
+
+    with pytest.raises(UserError, match="Namespace 'test_namespace' does not exist"):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_not_called()
+
+
+def test_ibm_watsonx_uploader_precheck_table_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.table_exists.return_value = False
+
+    with pytest.raises(
+        UserError,
+        match="Table 'test_table' does not exist in namespace 'test_namespace'",
+    ):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_upload_data_table_success(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    uploader.upload_data_table(mock_table, mock_data_table, file_data)
+
+    mock_delete.assert_called_once_with(mock_transaction, "test_identifier")
+    mock_transaction.append.assert_called_once_with(mock_data_table)
+
+
+def test_ibm_watsonx_uploader_upload_data_table_commit_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = CommitFailedException()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 5
+
+
+def test_ibm_watsonx_uploader_upload_data_table_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = Exception()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 0
+
+
+def test_ibm_watsonx_uploader_df_to_arrow_table(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+):
+    mock_fit_to_schema = mocker.patch.object(
+        IbmWatsonxUploader, "_fit_to_schema", return_value=test_df
+    )
+
+    result = uploader._df_to_arrow_table(test_df)
+
+    mock_fit_to_schema.assert_called_once_with(test_df, add_missing_columns=False)
+    assert len(result.column_names) == 3
+    assert "test_column_0" in result.column_names
+    assert "test_column_1" in result.column_names
+    assert "test_column_2" in result.column_names
+
+
+def test_ibm_watsonx_uploader_can_delete_column_exists(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(
+        IbmWatsonxUploader, "get_table_columns", return_value=["test_record_id_key"]
+    )
+
+    assert uploader.can_delete() is True
+
+
+def test_ibm_watsonx_uploader_can_delete_column_does_not_exist(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(IbmWatsonxUploader, "get_table_columns", return_value=["other_column"])
+
+    assert uploader.can_delete() is False
+
+
+def test_ibm_watsonx_uploader_get_table_columns_cache(
+    uploader: IbmWatsonxUploader,
+):
+    uploader._columns = ["cached_column"]
+
+    result = uploader.get_table_columns()
+
+    assert result == ["cached_column"]
+
+
+def test_ibm_watsonx_uploader_get_table_columns_no_cache(
+    uploader: IbmWatsonxUploader,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+):
+    uploader._columns = None
+    mock_table.schema.return_value.column_names = ["column_1", "column_2"]
+
+    result = uploader.get_table_columns()
+
+    mock_get_table.assert_called_once()
+    assert result == ["column_1", "column_2"]
+    assert uploader._columns == ["column_1", "column_2"]
+
+
+def test_ibm_watsonx_uploader_upload_dataframe_success(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+    mock_data_table: MagicMock,
+    file_data: FileData,
+):
+    mocker.patch.object(IbmWatsonxUploader, "_df_to_arrow_table", return_value=mock_data_table)
+    mock_upload_data_table = mocker.patch.object(IbmWatsonxUploader, "upload_data_table")
+
+    uploader.upload_dataframe(test_df, file_data)
+
+    mock_get_table.assert_called_once()
+    mock_upload_data_table.assert_called_once_with(mock_table, mock_data_table, file_data)
+
+
+def test_ibm_watsonx_uploader_delete_can_delete(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=True)
+    mock_equal_to = mocker.patch("pyiceberg.expressions.EqualTo")
+
+    uploader._delete(mock_transaction, "test_identifier")
+
+    mock_equal_to.assert_called_once_with("test_record_id_key", "test_identifier")
+    mock_transaction.delete.assert_called_once_with(delete_filter=mock_equal_to.return_value)
+
+
+def test_ibm_watsonx_uploader_delete_cannot_delete(
+    caplog: pytest.LogCaptureFixture,
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=False)
+
+    uploader._delete(mock_transaction, "test_identifier")
+    mock_transaction.delete.assert_not_called()
+    assert (
+        "Table doesn't contain expected record id column test_record_id_key, skipping delete"
+        in caplog.text
+    )
--- test/unit/v2/connectors/sql/test_sql.py
+++ test/unit/v2/connectors/sql/test_sql.py
@@ -1,10 +1,16 @@
 from pathlib import Path

+import pandas as pd
 import pytest
 from pytest_mock import MockerFixture

 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+)


 @pytest.fixture
@@ -12,6 +18,17 @@ def mock_instance() -> SQLUploadStager:
     return SQLUploadStager()


+@pytest.fixture
+def mock_uploader(mocker: MockerFixture) -> SQLUploader:
+    mock_connection_config = mocker.Mock(spec=SQLConnectionConfig)
+    mock_upload_config = mocker.Mock(spec=SQLUploaderConfig)
+    return SQLUploader(
+        upload_config=mock_upload_config,
+        connection_config=mock_connection_config,
+        connector_type="sql_test",
+    )
+
+
 @pytest.mark.parametrize(
     ("input_filepath", "output_filename", "expected"),
     [
@@ -72,3 +89,64 @@ def test_run_output_filename_suffix(
         path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
     )
     assert result.name == expected
+
+
+def test_fit_to_schema_drop_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+            "col3": [5, 6],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col3" not in result.columns
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+
+
+def test_fit_to_schema_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col2" in result.columns
+    assert result["col2"].isnull().all()
+
+
+def test_fit_to_schema_no_changes(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+    assert result.equals(df)
+
+
+def test_fit_to_schema_no_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df, add_missing_columns=False)
+
+    assert "col2" not in result.columns
+    assert "col1" in result.columns
--- unstructured_ingest/__version__.py
+++ unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.18"  # pragma: no cover
+__version__ = "0.5.20"  # pragma: no cover
--- unstructured_ingest/v2/processes/connectors/__init__.py
+++ unstructured_ingest/v2/processes/connectors/__init__.py
@@ -4,6 +4,7 @@ import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.duckdb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.elasticsearch  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.ibm_watsonx  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.lancedb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
@@ -121,4 +122,5 @@ add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
 add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)

 add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
+
 add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
--- /dev/null
+++ unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+from .ibm_watsonx_s3 import CONNECTOR_TYPE as IBM_WATSONX_S3_CONNECTOR_TYPE
+from .ibm_watsonx_s3 import ibm_watsonx_s3_destination_entry
+
+add_destination_entry(
+    destination_type=IBM_WATSONX_S3_CONNECTOR_TYPE, entry=ibm_watsonx_s3_destination_entry
+)
--- /dev/null
+++ unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py
@@ -0,0 +1,301 @@
+import logging
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import get_data_df
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLUploader,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from pyarrow import Table as ArrowTable
+    from pyiceberg.catalog.rest import RestCatalog
+    from pyiceberg.table import Table, Transaction
+
+CONNECTOR_TYPE = "ibm_watsonx_s3"
+
+DEFAULT_IBM_CLOUD_AUTH_URL = "https://iam.cloud.ibm.com/identity/token"
+DEFAULT_ICEBERG_URI_PATH = "/mds/iceberg"
+DEFAULT_ICEBERG_CATALOG_TYPE = "rest"
+
+
+class IcebergCommitFailedException(Exception):
+    """Failed to commit changes to the iceberg table."""
+
+
+class IbmWatsonxAccessConfig(AccessConfig):
+    iam_api_key: str = Field(description="IBM IAM API Key")
+    access_key_id: str = Field(description="Cloud Object Storage HMAC Access Key ID")
+    secret_access_key: str = Field(description="Cloud Object Storage HMAC Secret Access Key")
+
+
+class IbmWatsonxConnectionConfig(ConnectionConfig):
+    access_config: Secret[IbmWatsonxAccessConfig]
+    iceberg_endpoint: str = Field(description="Iceberg REST endpoint")
+    object_storage_endpoint: str = Field(description="Cloud Object Storage public endpoint")
+    object_storage_region: str = Field(description="Cloud Object Storage region")
+    catalog: str = Field(description="Catalog name")
+
+    _bearer_token: Optional[dict[str, Any]] = None
+
+    @property
+    def iceberg_url(self) -> str:
+        return f"https://{self.iceberg_endpoint.strip('/')}{DEFAULT_ICEBERG_URI_PATH}"
+
+    @property
+    def object_storage_url(self) -> str:
+        return f"https://{self.object_storage_endpoint.strip('/')}"
+
+    @property
+    def bearer_token(self) -> str:
+        # Add 60 seconds to deal with edge cases where the token expires before the request is made
+        timestamp = int(time.time()) + 60
+        if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
+            self._bearer_token = self.generate_bearer_token()
+        return self._bearer_token["access_token"]
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def wrap_error(self, e: Exception) -> Exception:
+        import httpx
+
+        if not isinstance(e, httpx.HTTPStatusError):
+            logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+            return e
+        url = e.request.url
+        response_code = e.response.status_code
+        if response_code == 401:
+            logger.error(
+                f"Failed to authenticate IBM watsonx.data user {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if response_code == 403:
+            logger.error(
+                f"Given IBM watsonx.data user is not authorized {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if 400 <= response_code < 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return UserError(e)
+        if response_code > 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return ProviderError(e)
+        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+        return e
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def generate_bearer_token(self) -> dict[str, Any]:
+        import httpx
+
+        headers = {
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "application/json",
+        }
+        data = {
+            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+            "apikey": self.access_config.get_secret_value().iam_api_key,
+        }
+
+        logger.info("Generating IBM IAM Bearer Token")
+        try:
+            response = httpx.post(DEFAULT_IBM_CLOUD_AUTH_URL, headers=headers, data=data)
+            response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e)
+        return response.json()
+
+    def get_catalog_config(self) -> dict[str, Any]:
+        return {
+            "name": self.catalog,
+            "type": DEFAULT_ICEBERG_CATALOG_TYPE,
+            "uri": self.iceberg_url,
+            "token": self.bearer_token,
+            "warehouse": self.catalog,
+            "s3.endpoint": self.object_storage_url,
+            "s3.access-key-id": self.access_config.get_secret_value().access_key_id,
+            "s3.secret-access-key": self.access_config.get_secret_value().secret_access_key,
+            "s3.region": self.object_storage_region,
+        }
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    @contextmanager
+    def get_catalog(self) -> Generator["RestCatalog", None, None]:
+        from pyiceberg.catalog import load_catalog
+
+        try:
+            catalog_config = self.get_catalog_config()
+            catalog = load_catalog(**catalog_config)
+        except Exception as e:
+            logger.error(f"Failed to connect to catalog '{self.catalog}': {e}", exc_info=True)
+            raise ProviderError(f"Failed to connect to catalog '{self.catalog}': {e}")
+
+        yield catalog
+
+
+@dataclass
+class IbmWatsonxUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+@dataclass
+class IbmWatsonxUploadStager(SQLUploadStager):
+    upload_stager_config: IbmWatsonxUploadStagerConfig = field(
+        default_factory=IbmWatsonxUploadStagerConfig
+    )
+
+
+class IbmWatsonxUploaderConfig(UploaderConfig):
+    namespace: str = Field(description="Namespace name")
+    table: str = Field(description="Table name")
+    max_retries: int = Field(
+        default=5, description="Maximum number of retries to upload data", ge=2, le=10
+    )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="Searchable key to find entries for the same record on previous runs",
+    )
+
+    @property
+    def table_identifier(self) -> Tuple[str, str]:
+        return (self.namespace, self.table)
+
+
+@dataclass
+class IbmWatsonxUploader(SQLUploader):
+    connection_config: IbmWatsonxConnectionConfig
+    upload_config: IbmWatsonxUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_catalog() as catalog:
+            if not catalog.namespace_exists(self.upload_config.namespace):
+                raise UserError(f"Namespace '{self.upload_config.namespace}' does not exist")
+            if not catalog.table_exists(self.upload_config.table_identifier):
+                raise UserError(
+                    f"Table '{self.upload_config.table}' does not exist in namespace '{self.upload_config.namespace}'"  # noqa: E501
+                )
+
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.connection_config.get_catalog() as catalog:
+            table = catalog.load_table(self.upload_config.table_identifier)
+            yield table
+
+    def get_table_columns(self) -> list[str]:
+        if self._columns is None:
+            with self.get_table() as table:
+                self._columns = table.schema().column_names
+        return self._columns
+
+    def can_delete(self) -> bool:
+        return self.upload_config.record_id_key in self.get_table_columns()
+
+    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
+    def _df_to_arrow_table(self, df: pd.DataFrame) -> "ArrowTable":
+        import pyarrow as pa
+
+        # Iceberg will automatically fill missing columns with nulls
+        # Iceberg will throw an error if the DataFrame column has only null values
+        # because it can't infer the type of the column and match it with the table schema
+        return pa.Table.from_pandas(self._fit_to_schema(df, add_missing_columns=False))
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    def _delete(self, transaction: "Transaction", identifier: str) -> None:
+        from pyiceberg.expressions import EqualTo
+
+        if self.can_delete():
+            transaction.delete(delete_filter=EqualTo(self.upload_config.record_id_key, identifier))
+        else:
+            logger.warning(
+                f"Table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+
+    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
+    def upload_data_table(
+        self, table: "Table", data_table: "ArrowTable", file_data: FileData
+    ) -> None:
+        from pyiceberg.exceptions import CommitFailedException
+        from tenacity import (
+            before_log,
+            retry,
+            retry_if_exception_type,
+            stop_after_attempt,
+            wait_random,
+        )
+
+        @retry(
+            stop=stop_after_attempt(self.upload_config.max_retries),
+            wait=wait_random(),
+            retry=retry_if_exception_type(IcebergCommitFailedException),
+            before=before_log(logger, logging.DEBUG),
+            reraise=True,
+        )
+        def _upload_data_table(table: "Table", data_table: "ArrowTable", file_data: FileData):
+            try:
+                with table.transaction() as transaction:
+                    self._delete(transaction, file_data.identifier)
+                    transaction.append(data_table)
+            except CommitFailedException as e:
+                table.refresh()
+                logger.debug(e)
+                raise IcebergCommitFailedException(e)
+            except Exception as e:
+                raise ProviderError(f"Failed to upload data to table: {e}")
+
+        try:
+            return _upload_data_table(table, data_table, file_data)
+        except ProviderError:
+            raise
+        except Exception as e:
+            raise ProviderError(f"Failed to upload data to table: {e}")
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        data_table = self._df_to_arrow_table(df)
+
+        with self.get_table() as table:
+            self.upload_data_table(table, data_table, file_data)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path=path)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+
+ibm_watsonx_s3_destination_entry = DestinationRegistryEntry(
+    connection_config=IbmWatsonxConnectionConfig,
+    uploader=IbmWatsonxUploader,
+    uploader_config=IbmWatsonxUploaderConfig,
+    upload_stager=IbmWatsonxUploadStager,
+    upload_stager_config=IbmWatsonxUploadStagerConfig,
+)
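
For orientation, a rough sketch of how the new IBM watsonx.data destination could be instantiated, using only the classes and fields shown in the hunks above; all endpoint, credential, namespace, and table values below are placeholders, and the surrounding pipeline wiring is omitted:

    from pydantic import Secret

    from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
        IbmWatsonxAccessConfig,
        IbmWatsonxConnectionConfig,
        IbmWatsonxUploader,
        IbmWatsonxUploaderConfig,
    )

    # Placeholder credentials/endpoints; real values come from IBM Cloud and watsonx.data.
    connection_config = IbmWatsonxConnectionConfig(
        access_config=Secret(
            IbmWatsonxAccessConfig(
                iam_api_key="<iam-api-key>",
                access_key_id="<hmac-access-key-id>",
                secret_access_key="<hmac-secret-access-key>",
            )
        ),
        iceberg_endpoint="<iceberg-rest-endpoint>",
        object_storage_endpoint="<cos-public-endpoint>",
        object_storage_region="<cos-region>",
        catalog="<catalog-name>",
    )
    uploader = IbmWatsonxUploader(
        connection_config=connection_config,
        upload_config=IbmWatsonxUploaderConfig(namespace="<namespace>", table="<table>"),
    )
    uploader.precheck()  # verifies that the namespace and table exist before uploading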
--- unstructured_ingest/v2/processes/connectors/redisdb.py
+++ unstructured_ingest/v2/processes/connectors/redisdb.py
@@ -110,6 +110,7 @@ class RedisConnectionConfig(ConnectionConfig):

 class RedisUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    key_prefix: str = Field(default="", description="Prefix for Redis keys")


 @dataclass
@@ -145,11 +146,11 @@ class RedisUploader(Uploader):
         async with self.connection_config.create_async_client() as async_client:
             async with async_client.pipeline(transaction=True) as pipe:
                 for element in batch:
-                    element_id = element["element_id"]
+                    key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
                     if redis_stack:
-                        pipe.json().set(element_id, "$", element)
+                        pipe.json().set(key_with_prefix, "$", element)
                     else:
-                        pipe.set(element_id, json.dumps(element))
+                        pipe.set(key_with_prefix, json.dumps(element))
                 await pipe.execute()

     @requires_dependencies(["redis"], extras="redis")
@@ -159,16 +160,16 @@ class RedisUploader(Uploader):
         redis_stack = True
         async with self.connection_config.create_async_client() as async_client:
             async with async_client.pipeline(transaction=True) as pipe:
-                element_id = element["element_id"]
+                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
                 try:
                     # Redis with stack extension supports JSON type
-                    await pipe.json().set(element_id, "$", element).execute()
+                    await pipe.json().set(key_with_prefix, "$", element).execute()
                 except redis_exceptions.ResponseError as e:
                     message = str(e)
                     if "unknown command `JSON.SET`" in message:
                         # if this error occurs, Redis server doesn't support JSON type,
                         # so save as string type instead
-                        await pipe.set(element_id, json.dumps(element)).execute()
+                        await pipe.set(key_with_prefix, json.dumps(element)).execute()
                         redis_stack = False
                     else:
                         raise e
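
For orientation, the new key_prefix option above is simply prepended to each element's element_id when the Redis key is built (f"{key_prefix}{element_id}"). A minimal configuration sketch, mirroring the integration test; the prefix value is an arbitrary example:

    from unstructured_ingest.v2.processes.connectors.redisdb import RedisUploaderConfig

    # With this config, an element whose element_id is "abc123" (hypothetical) is
    # written under the Redis key "test_ingest:abc123" instead of bare "abc123".
    upload_config = RedisUploaderConfig(batch_size=10, key_prefix="test_ingest:")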
--- unstructured_ingest/v2/processes/connectors/sql/sql.py
+++ unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -323,7 +323,7 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _fit_to_schema(self, df: pd.DataFrame, add_missing_columns: bool = True) -> pd.DataFrame:
         table_columns = self.get_table_columns()
         columns = set(df.columns)
         schema_fields = set(table_columns)
@@ -335,7 +335,7 @@ class SQLUploader(Uploader):
                 "Following columns will be dropped to match the table's schema: "
                 f"{', '.join(columns_to_drop)}"
             )
-        if missing_columns:
+        if missing_columns and add_missing_columns:
             logger.info(
                 "Following null filled columns will be added to match the table's schema:"
                 f" {', '.join(missing_columns)} "
@@ -343,8 +343,9 @@ class SQLUploader(Uploader):

         df = df.drop(columns=columns_to_drop)

-        for column in missing_columns:
-            df[column] = pd.Series()
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
         return df

     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
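
For orientation, the new add_missing_columns flag changes only the back-fill step of _fit_to_schema: columns missing from the target table are still dropped, but null-filled columns are only added when the flag is left at its default of True. A small sketch mirroring the new unit tests (the mocked uploader and the ["col1", "col2"] table schema are illustrative assumptions):

    from unittest.mock import Mock, patch

    import pandas as pd

    from unstructured_ingest.v2.processes.connectors.sql.sql import (
        SQLConnectionConfig,
        SQLUploader,
        SQLUploaderConfig,
    )

    uploader = SQLUploader(
        upload_config=Mock(spec=SQLUploaderConfig),
        connection_config=Mock(spec=SQLConnectionConfig),
        connector_type="sql_test",
    )
    df = pd.DataFrame({"col1": [1, 2]})

    with patch.object(uploader, "get_table_columns", return_value=["col1", "col2"]):
        # Default behaviour: the missing "col2" is added and filled with nulls.
        with_nulls = uploader._fit_to_schema(df)
        # add_missing_columns=False (as the watsonx.data uploader does before building
        # an Arrow table): "col2" is left out and Iceberg back-fills it on append.
        without_nulls = uploader._fit_to_schema(df, add_missing_columns=False)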
--- unstructured_ingest-0.5.18.dist-info/METADATA
+++ unstructured_ingest-0.5.20.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.18
+Version: 0.5.20
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: python-dateutil
+Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
-Requires-Dist: dataclasses_json
 Requires-Dist: tqdm
-Requires-Dist: click
-Requires-Dist: pydantic>=2.7
+Requires-Dist: dataclasses_json
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
 Provides-Extra: csv
@@ -86,8 +86,8 @@ Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: dropbox
@@ -99,8 +99,8 @@ Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -109,8 +109,13 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
+Provides-Extra: ibm-watsonx-s3
+Requires-Dist: httpx; extra == "ibm-watsonx-s3"
+Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
+Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
+Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -125,13 +130,13 @@ Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: onedrive
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
@@ -152,8 +157,8 @@ Requires-Dist: praw; extra == "reddit"
 Provides-Extra: redis
 Requires-Dist: redis; extra == "redis"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -165,8 +170,8 @@ Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: slack
 Requires-Dist: slack_sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: wikipedia
 Requires-Dist: wikipedia; extra == "wikipedia"
 Provides-Extra: weaviate
@@ -178,17 +183,17 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: zendesk
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: httpx; extra == "zendesk"
-Requires-Dist: bs4; extra == "zendesk"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
--- unstructured_ingest-0.5.18.dist-info/RECORD
+++ unstructured_ingest-0.5.20.dist-info/RECORD
@@ -21,7 +21,7 @@ test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qko
 test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
 test/integration/connectors/test_pinecone.py,sha256=9FC0frer7gtDzk5A6OhGsV8S4ggYfa5ReEO9t7L3Am0,13649
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
-test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
+test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
 test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
@@ -93,10 +93,12 @@ test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vP
 test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
 test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
+test/unit/v2/connectors/ibm_watsonx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py,sha256=gvgF9vCA_cPQVS_IC6VFvnP4ojFVKOH7eorM6k5VR84,14518
 test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/motherduck/test_base.py,sha256=f3W7hppEZ904_I_fKax-5LVDp-0yj04DjF1ccZ4k5O8,2503
 test/unit/v2/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/unit/v2/connectors/sql/test_sql.py,sha256=51-AKUBxw6ThO68bjenLopUUuxM88YZb2rMUV8L6YwY,2464
+test/unit/v2/connectors/sql/test_sql.py,sha256=wA5LvLtmaCi-8YDOd515j3YnP0_E4qi7z50NFXBn75g,4634
 test/unit/v2/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/embedders/test_bedrock.py,sha256=HMaweO_v_9Y1SE2m5QImXP73cb26vNTUfc1onTBa1-g,1074
 test/unit/v2/embedders/test_huggingface.py,sha256=TOHUKC7hAadl6XTotr8UqOCq28kbQxOIkPSrMxr2PLU,1546
@@ -111,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=QYn6GUOSyCz_KH2wi4yg_FlUU4SE844Xhf0hR6-jv8s,43
+unstructured_ingest/__version__.py,sha256=BCszjb86jsmMjfakEG2zLAZFKHpLYTR2k5TCe7RzaBc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -426,7 +428,7 @@ unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8g
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
-unstructured_ingest/v2/processes/connectors/__init__.py,sha256=ebLvZes84qRx4eS20SkvlVH6WIIM76hifyUgkUJ-dfg,6588
+unstructured_ingest/v2/processes/connectors/__init__.py,sha256=l4Xq4AuzRMTqUv5TU7cE1NbhGCka4SFJFZwG1FoVotE,6666
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=5xc5pWFicE_-2BV38oK-nnzAMI2EzF-q8XAqQ3qPUR8,18249
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
@@ -446,7 +448,7 @@ unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=I-eDLAlThHKKFQfkZpQL
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=5rg7t40gKxDHNcuJrJHmVzJ9uM7Ct4RBOvFsfwdGc5c,18002
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
 unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=O9lC4mZ9V_exg9apiCJSWHsgkuYDSEOlI6CaUS5ZB7c,13961
-unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
+unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=0h105_MpOO4-uydiyHgM4TvduSkAMAr931KFANcKW8Y,6936
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -477,6 +479,8 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5B
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
+unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
+unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py,sha256=zaj5MVsM-uf7IRgZGg7QwRtzjtTM1gCYuqji61TrqWk,11562
 unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
 unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=GdAeQ8Uz-6v1C5byBHtjfevVfbzW3obScBFFLRTb0ps,3441
 unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=UfS41jzV9VxekS6AwWHhURJmJ7RUAw5iiIrj75BWrXQ,10255
@@ -564,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=0hfiX_u7V38k_RfoeDmXJp8WIHZ19ilIHnrgZVSleKw,9270
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=F5PPUxt2W8JaAQGfz5Od0FvKqYa15RfwMIlnrdJu1nk,15317
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=wtVK6CHrQ4McwsPifUoa7KKaY-v0cjDZJetASSAaSIA,15415
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -577,9 +581,9 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
 unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
 unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
-unstructured_ingest-0.5.18.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.5.18.dist-info/METADATA,sha256=K47-NP1RfNwqRnvbZ8vO75ab5J5RSmb5nocwSXNwqko,8465
-unstructured_ingest-0.5.18.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-unstructured_ingest-0.5.18.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.5.18.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.5.18.dist-info/RECORD,,
+unstructured_ingest-0.5.20.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.20.dist-info/METADATA,sha256=S2Yr62sVeW0csT-QRyonnokiHFvvH0FAwQ2x02BqAeM,8697
+unstructured_ingest-0.5.20.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.20.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.20.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.20.dist-info/RECORD,,