unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

This version of unstructured-ingest has been flagged as potentially problematic.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
test/integration/connectors/sql/test_snowflake.py

@@ -1,16 +1,19 @@
+import json
 import os
-import tempfile
 from pathlib import Path
 
-import docker
-import pandas as pd
 import pytest
 import snowflake.connector as sf
+from _pytest.fixtures import TopRequest
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
 from test.integration.connectors.utils.docker import container_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -30,14 +33,15 @@ from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
 SEED_DATA_ROWS = 20
 
 
-def seed_data():
-    conn = sf.connect(
-        user="test",
-        password="test",
-        account="test",
-        database="test",
-        host="snowflake.localhost.localstack.cloud",
-    )
+def seed_data() -> dict:
+    connect_params = {
+        "user": "test",
+        "password": "test",
+        "account": "test",
+        "database": "test",
+        "host": "snowflake.localhost.localstack.cloud",
+    }
+    conn = sf.connect(**connect_params)
 
     file = Path(env_setup_path / "sql" / "snowflake" / "source" / "snowflake-schema.sql")
@@ -52,16 +56,31 @@ def seed_data():
 
     cur.close()
     conn.close()
+    return connect_params
 
 
-def init_db_destination():
-    conn = sf.connect(
-        user="test",
-        password="test",
-        account="test",
-        database="test",
-        host="snowflake.localhost.localstack.cloud",
-    )
+@pytest.fixture
+def source_database_setup() -> dict:
+    token = os.getenv("LOCALSTACK_AUTH_TOKEN")
+    with container_context(
+        image="localstack/snowflake",
+        environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
+        ports={4566: 4566, 443: 443},
+        healthcheck_retries=30,
+    ):
+        connect_params = seed_data()
+        yield connect_params
+
+
+def init_db_destination() -> dict:
+    connect_params = {
+        "user": "test",
+        "password": "test",
+        "account": "test",
+        "database": "test",
+        "host": "snowflake.localhost.localstack.cloud",
+    }
+    conn = sf.connect(**connect_params)
 
     file = Path(env_setup_path / "sql" / "snowflake" / "destination" / "snowflake-schema.sql")
@@ -73,52 +92,53 @@ def init_db_destination():
 
     cur.close()
     conn.close()
+    return connect_params
 
 
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
-@requires_env("LOCALSTACK_AUTH_TOKEN")
-async def test_snowflake_source():
-    docker_client = docker.from_env()
+@pytest.fixture
+def destination_database_setup() -> dict:
     token = os.getenv("LOCALSTACK_AUTH_TOKEN")
     with container_context(
-        docker_client=docker_client,
         image="localstack/snowflake",
         environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
         ports={4566: 4566, 443: 443},
        healthcheck_retries=30,
     ):
-        seed_data()
-        with tempfile.TemporaryDirectory() as tmpdir:
-            connection_config = SnowflakeConnectionConfig(
-                access_config=SnowflakeAccessConfig(password="test"),
-                account="test",
-                user="test",
-                database="test",
-                host="snowflake.localhost.localstack.cloud",
-            )
-            indexer = SnowflakeIndexer(
-                connection_config=connection_config,
-                index_config=SnowflakeIndexerConfig(
-                    table_name="cars", id_column="CAR_ID", batch_size=5
-                ),
-            )
-            downloader = SnowflakeDownloader(
-                connection_config=connection_config,
-                download_config=SnowflakeDownloaderConfig(
-                    fields=["CAR_ID", "BRAND"], download_dir=Path(tmpdir)
-                ),
-            )
-            await source_connector_validation(
-                indexer=indexer,
-                downloader=downloader,
-                configs=ValidationConfigs(
-                    test_id="snowflake",
-                    expected_num_files=SEED_DATA_ROWS,
-                    expected_number_indexed_file_data=4,
-                    validate_downloaded_files=True,
-                ),
-            )
+        connect_params = init_db_destination()
+        yield connect_params
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+@requires_env("LOCALSTACK_AUTH_TOKEN")
+async def test_snowflake_source(temp_dir: Path, source_database_setup: dict):
+    connection_config = SnowflakeConnectionConfig(
+        access_config=SnowflakeAccessConfig(password="test"),
+        account="test",
+        user="test",
+        database="test",
+        host="snowflake.localhost.localstack.cloud",
+    )
+    indexer = SnowflakeIndexer(
+        connection_config=connection_config,
+        index_config=SnowflakeIndexerConfig(table_name="cars", id_column="CAR_ID", batch_size=5),
+    )
+    downloader = SnowflakeDownloader(
+        connection_config=connection_config,
+        download_config=SnowflakeDownloaderConfig(
+            fields=["CAR_ID", "BRAND"], download_dir=temp_dir
+        ),
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="snowflake",
+            expected_num_files=SEED_DATA_ROWS,
+            expected_number_indexed_file_data=4,
+            validate_downloaded_files=True,
+        ),
+    )
 
 
 def validate_destination(
@@ -145,65 +165,70 @@ def validate_destination(
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
 @requires_env("LOCALSTACK_AUTH_TOKEN")
-async def test_snowflake_destination(upload_file: Path):
+async def test_snowflake_destination(
+    upload_file: Path, temp_dir: Path, destination_database_setup: dict
+):
     # the snowflake destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    docker_client = docker.from_env()
-    token = os.getenv("LOCALSTACK_AUTH_TOKEN")
-    with container_context(
-        docker_client=docker_client,
-        image="localstack/snowflake",
-        environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
-        ports={4566: 4566, 443: 443},
-        healthcheck_retries=30,
-    ):
-        init_db_destination()
-        with tempfile.TemporaryDirectory() as tmpdir:
-            stager = SnowflakeUploadStager()
-            stager_params = {
-                "elements_filepath": upload_file,
-                "file_data": mock_file_data,
-                "output_dir": Path(tmpdir),
-                "output_filename": "test_db",
-            }
-            if stager.is_async():
-                staged_path = await stager.run_async(**stager_params)
-            else:
-                staged_path = stager.run(**stager_params)
-
-            # The stager should append the `.json` suffix to the output filename passed in.
-            assert staged_path.name == "test_db.json"
-
-            connect_params = {
-                "user": "test",
-                "password": "test",
-                "account": "test",
-                "database": "test",
-                "host": "snowflake.localhost.localstack.cloud",
-            }
-
-            uploader = SnowflakeUploader(
-                connection_config=SnowflakeConnectionConfig(
-                    access_config=SnowflakeAccessConfig(password=connect_params["password"]),
-                    account=connect_params["account"],
-                    user=connect_params["user"],
-                    database=connect_params["database"],
-                    host=connect_params["host"],
-                )
-            )
-
-            uploader.run(path=staged_path, file_data=mock_file_data)
-
-            staged_df = pd.read_json(staged_path, orient="records", lines=True)
-            expected_num_elements = len(staged_df)
-            validate_destination(
-                connect_params=connect_params,
-                expected_num_elements=expected_num_elements,
-            )
-
-            uploader.run(path=staged_path, file_data=mock_file_data)
-            validate_destination(
-                connect_params=connect_params,
-                expected_num_elements=expected_num_elements,
-            )
+    init_db_destination()
+    stager = SnowflakeUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=mock_file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    # The stager should preserve the suffix of the output filename passed in.
+    assert staged_path.suffix == upload_file.suffix
+
+    connect_params = {
+        "user": "test",
+        "password": "test",
+        "account": "test",
+        "database": "test",
+        "host": "snowflake.localhost.localstack.cloud",
+    }
+
+    uploader = SnowflakeUploader(
+        connection_config=SnowflakeConnectionConfig(
+            access_config=SnowflakeAccessConfig(password=connect_params["password"]),
+            account=connect_params["account"],
+            user=connect_params["user"],
+            database=connect_params["database"],
+            host=connect_params["host"],
+        )
+    )
+    uploader.precheck()
+    uploader.run(path=staged_path, file_data=mock_file_data)
+
+    with staged_path.open("r") as f:
+        staged_data = json.load(f)
+    expected_num_elements = len(staged_data)
+    validate_destination(
+        connect_params=connect_params,
+        expected_num_elements=expected_num_elements,
+    )
+
+    uploader.run(path=staged_path, file_data=mock_file_data)
+    validate_destination(
+        connect_params=connect_params,
+        expected_num_elements=expected_num_elements,
+    )
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_snowflake_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = SnowflakeUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
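
The pattern running through this file (and the other SQL connector tests below) is the move from inline setup to pytest fixtures: LocalStack container startup and schema seeding now live in source_database_setup/destination_database_setup fixtures that yield connection parameters, and tests request them as arguments instead of nesting container_context and tempfile.TemporaryDirectory blocks. A minimal sketch of the yield-fixture pattern, with start_database as a hypothetical stand-in for container_context plus seed_data:

from contextlib import contextmanager

import pytest


@contextmanager
def start_database():
    # Hypothetical stand-in for container_context() + seed_data(): start the
    # backing service and hand back the connection parameters it was seeded with.
    connect_params = {"user": "test", "password": "test", "host": "localhost"}
    yield connect_params
    # Teardown (e.g. stopping the container) would run here.


@pytest.fixture
def database_setup() -> dict:
    # Code before `yield` runs as setup, code after it as teardown, so the
    # service is cleaned up even when the test body fails.
    with start_database() as connect_params:
        yield connect_params


def test_connector(database_setup: dict):
    assert database_setup["host"] == "localhost"

The expected counts in the rewritten source tests are also consistent with the indexer batching rows into FileData objects: 20 seeded rows at batch_size=5 give ceil(20/5) = 4 indexed file data here, and 10 rows at batch_size=6 give ceil(10/6) = 2 in the sqlite test below.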

test/integration/connectors/sql/test_sqlite.py

@@ -1,14 +1,18 @@
+import json
 import sqlite3
 import tempfile
-from contextlib import contextmanager
 from pathlib import Path
 
-import pandas as pd
 import pytest
+from _pytest.fixtures import TopRequest
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from unstructured_ingest.v2.interfaces import FileData
@@ -23,11 +27,11 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     SQLiteUploadStager,
 )
 
-SEED_DATA_ROWS = 20
+SEED_DATA_ROWS = 10
 
 
-@contextmanager
-def sqlite_download_setup() -> Path:
+@pytest.fixture
+def source_database_setup() -> Path:
     with tempfile.TemporaryDirectory() as tmpdir:
         db_path = Path(tmpdir) / "mock_database.db"
         db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
@@ -49,49 +53,42 @@ def sqlite_download_setup() -> Path:
 
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
-async def test_sqlite_source():
-    with sqlite_download_setup() as db_path:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            connection_config = SQLiteConnectionConfig(database_path=db_path)
-            indexer = SQLiteIndexer(
-                connection_config=connection_config,
-                index_config=SQLiteIndexerConfig(
-                    table_name="cars", id_column="car_id", batch_size=5
-                ),
-            )
-            downloader = SQLiteDownloader(
-                connection_config=connection_config,
-                download_config=SQLiteDownloaderConfig(
-                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
-                ),
-            )
-            await source_connector_validation(
-                indexer=indexer,
-                downloader=downloader,
-                configs=ValidationConfigs(
-                    test_id="sqlite",
-                    expected_num_files=SEED_DATA_ROWS,
-                    expected_number_indexed_file_data=4,
-                    validate_downloaded_files=True,
-                ),
-            )
-
-
-@contextmanager
-def sqlite_upload_setup() -> Path:
+async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
+    connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
+    indexer = SQLiteIndexer(
+        connection_config=connection_config,
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
+    )
+    downloader = SQLiteDownloader(
+        connection_config=connection_config,
+        download_config=SQLiteDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir),
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="sqlite",
+            expected_num_files=SEED_DATA_ROWS,
+            expected_number_indexed_file_data=2,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+@pytest.fixture
+def destination_database_setup(temp_dir: Path) -> Path:
     # Provision the local file that sqlite points to to have the desired schema for the integration
     # tests and make sure the file and connection get cleaned up by using a context manager.
-    with tempfile.TemporaryDirectory() as tmpdir:
-        db_path = Path(tmpdir) / "elements.db"
-        db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
-        assert db_init_path.exists()
-        assert db_init_path.is_file()
-        with sqlite3.connect(database=db_path) as sqlite_connection:
-            with db_init_path.open("r") as f:
-                query = f.read()
-            cursor = sqlite_connection.cursor()
-            cursor.executescript(query)
-        yield db_path
+    db_path = temp_dir / "elements.db"
+    db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
+    assert db_init_path.exists()
+    assert db_init_path.is_file()
+    with sqlite3.connect(database=db_path) as sqlite_connection:
+        with db_init_path.open("r") as f:
+            query = f.read()
+        cursor = sqlite_connection.cursor()
+        cursor.executescript(query)
+    return db_path
 
 
 def validate_destination(db_path: Path, expected_num_elements: int):
@@ -114,34 +111,48 @@ def validate_destination(db_path: Path, expected_num_elements: int):
 
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
-async def test_sqlite_destination(upload_file: Path):
+async def test_sqlite_destination(
+    upload_file: Path, temp_dir: Path, destination_database_setup: Path
+):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with sqlite_upload_setup() as db_path:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            stager = SQLiteUploadStager()
-            stager_params = {
-                "elements_filepath": upload_file,
-                "file_data": mock_file_data,
-                "output_dir": Path(tmpdir),
-                "output_filename": "test_db",
-            }
-            if stager.is_async():
-                staged_path = await stager.run_async(**stager_params)
-            else:
-                staged_path = stager.run(**stager_params)
-
-            # The stager should append the `.json` suffix to the output filename passed in.
-            assert staged_path.name == "test_db.json"
-
-            uploader = SQLiteUploader(
-                connection_config=SQLiteConnectionConfig(database_path=db_path)
-            )
-            uploader.run(path=staged_path, file_data=mock_file_data)
-
-            staged_df = pd.read_json(staged_path, orient="records", lines=True)
-            validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
-
-            uploader.run(path=staged_path, file_data=mock_file_data)
-            validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
+    stager = SQLiteUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=mock_file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    # The stager should preserve the suffix of the output filename passed in.
+    assert staged_path.suffix == upload_file.suffix
+
+    uploader = SQLiteUploader(
+        connection_config=SQLiteConnectionConfig(database_path=destination_database_setup)
+    )
+    uploader.precheck()
+    uploader.run(path=staged_path, file_data=mock_file_data)
+
+    with staged_path.open("r") as f:
+        staged_data = json.load(f)
+    validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
+
+    uploader.run(path=staged_path, file_data=mock_file_data)
+    validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_sqlite_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = SQLiteUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )

test/integration/connectors/test_astradb.py

@@ -5,16 +5,27 @@ from pathlib import Path
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from astrapy import Collection
 from astrapy import DataAPIClient as AstraDBClient
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (
     CONNECTOR_TYPE,
     AstraDBAccessConfig,
     AstraDBConnectionConfig,
+    AstraDBDownloader,
+    AstraDBDownloaderConfig,
     AstraDBIndexer,
     AstraDBIndexerConfig,
     AstraDBUploader,
@@ -105,10 +116,44 @@ def collection(upload_file: Path) -> Collection:
         astra_db.drop_collection(collection)
 
 
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+async def test_astra_search_source(
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    collection_name = "ingest_test_src"
+    connection_config = AstraDBConnectionConfig(
+        access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
+    )
+    indexer = AstraDBIndexer(
+        index_config=AstraDBIndexerConfig(
+            collection_name=collection_name,
+        ),
+        connection_config=connection_config,
+    )
+    downloader = AstraDBDownloader(
+        connection_config=connection_config,
+        download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
+    )
+
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=5,
+            expected_number_indexed_file_data=1,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
-async def test_azure_ai_search_destination(
+async def test_astra_search_destination(
     upload_file: Path,
     collection: Collection,
     tmp_path: Path,
@@ -154,3 +199,19 @@ async def test_azure_ai_search_destination(
         f"Expected count ({expected_count}) doesn't match how "
         f"much came back from collection: {current_count}"
     )
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_astra_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AstraDBUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
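
Taken together, these destination tests pin down one stager contract: run(elements_filepath, file_data, output_dir, output_filename) writes a staged file and returns its path, and per the updated assertions the staged file keeps the input filename's suffix (.json or .ndjson) instead of always being renamed to .json. A toy stand-in illustrating only the shape of that contract, not unstructured-ingest's actual UploadStager implementation (this sketch handles plain JSON; the real stagers also accept NDJSON):

import json
from pathlib import Path


class ToyUploadStager:
    # Illustrative only: mirrors the call signature the tests exercise.
    def run(
        self,
        elements_filepath: Path,
        file_data: object,  # a FileData instance in the real interface
        output_dir: Path,
        output_filename: str,
    ) -> Path:
        # Keep the caller's filename, and therefore its suffix, intact.
        output_path = output_dir / output_filename
        elements = json.loads(elements_filepath.read_text())
        output_path.write_text(json.dumps(elements))
        return output_path

Under that contract, calling stager.run(..., output_filename=upload_file.name) on elements.ndjson returns a path ending in .ndjson, which is exactly what assert staged_path.suffix == upload_file.suffix checks.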

test/integration/connectors/test_azure_ai_search.py

@@ -5,6 +5,7 @@ from pathlib import Path
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from azure.core.credentials import AzureKeyCredential
 from azure.search.documents import SearchClient
 from azure.search.documents.indexes import SearchIndexClient
@@ -25,6 +26,10 @@ from azure.search.documents.indexes.models import (
 from test.integration.connectors.utils.constants import (
     DESTINATION_TAG,
 )
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
@@ -225,9 +230,26 @@ async def test_azure_ai_search_destination(
     with staged_filepath.open() as f:
         staged_elements = json.load(f)
     expected_count = len(staged_elements)
-    search_client: SearchClient = uploader.connection_config.get_search_client()
-    validate_count(search_client=search_client, expected_count=expected_count)
+    with uploader.connection_config.get_search_client() as search_client:
+        validate_count(search_client=search_client, expected_count=expected_count)
 
     # Rerun and make sure the same documents get updated
     uploader.run(path=staged_filepath, file_data=file_data)
-    validate_count(search_client=search_client, expected_count=expected_count)
+    with uploader.connection_config.get_search_client() as search_client:
+        validate_count(search_client=search_client, expected_count=expected_count)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_azure_ai_search_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AzureAISearchUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
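
The one behavioral change in the destination test itself is wrapping get_search_client() in a with block. The azure-search-documents SearchClient wraps an HTTP session and supports the context-manager protocol, so scoping it per validation pass closes the session deterministically instead of leaving it open for the remainder of the test. A short sketch with placeholder endpoint, index, and key values:

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Endpoint, index name, and API key below are placeholders for illustration.
client = SearchClient(
    endpoint="https://example.search.windows.net",
    index_name="ingest-test",
    credential=AzureKeyCredential("<api-key>"),
)

# __exit__ calls close(), releasing the underlying HTTP session once the
# block finishes.
with client:
    count = client.get_document_count()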