PyPI - unstructured-ingest - Versions diffs - 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (93) hide show

test/integration/connectors/sql/test_snowflake.py CHANGED Viewed

@@ -86,7 +86,7 @@ async def test_snowflake_source():
         image="localstack/snowflake",
         environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
         ports={4566: 4566, 443: 443},
-        healthcheck_timeout=30,
+        healthcheck_retries=30,
     ):
         seed_data()
         with tempfile.TemporaryDirectory() as tmpdir:
@@ -156,7 +156,7 @@ async def test_snowflake_destination(upload_file: Path):
         image="localstack/snowflake",
         environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
         ports={4566: 4566, 443: 443},
-        healthcheck_timeout=30,
+        healthcheck_retries=30,
     ):
         init_db_destination()
         with tempfile.TemporaryDirectory() as tmpdir:
@@ -192,10 +192,8 @@ async def test_snowflake_destination(upload_file: Path):
                     host=connect_params["host"],
                 )
             )
-            if uploader.is_async():
-                await uploader.run_async(path=staged_path, file_data=mock_file_data)
-            else:
-                uploader.run(path=staged_path, file_data=mock_file_data)
+            uploader.run(path=staged_path, file_data=mock_file_data)
             staged_df = pd.read_json(staged_path, orient="records", lines=True)
             expected_num_elements = len(staged_df)
@@ -203,3 +201,9 @@ async def test_snowflake_destination(upload_file: Path):
                 connect_params=connect_params,
                 expected_num_elements=expected_num_elements,
             )
+            uploader.run(path=staged_path, file_data=mock_file_data)
+            validate_destination(
+                connect_params=connect_params,
+                expected_num_elements=expected_num_elements,
+            )

test/integration/connectors/sql/test_sqlite.py CHANGED Viewed

@@ -138,10 +138,10 @@ async def test_sqlite_destination(upload_file: Path):
             uploader = SQLiteUploader(
                 connection_config=SQLiteConnectionConfig(database_path=db_path)
             )
-            if uploader.is_async():
-                await uploader.run_async(path=staged_path, file_data=mock_file_data)
-            else:
-                uploader.run(path=staged_path, file_data=mock_file_data)
+            uploader.run(path=staged_path, file_data=mock_file_data)
             staged_df = pd.read_json(staged_path, orient="records", lines=True)
             validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
+            uploader.run(path=staged_path, file_data=mock_file_data)
+            validate_destination(db_path=db_path, expected_num_elements=len(staged_df))

test/integration/connectors/test_astradb.py ADDED Viewed

@@ -0,0 +1,156 @@
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from uuid import uuid4
+import pytest
+from astrapy import Collection
+from astrapy import DataAPIClient as AstraDBClient
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.astradb import (
+    CONNECTOR_TYPE,
+    AstraDBAccessConfig,
+    AstraDBConnectionConfig,
+    AstraDBIndexer,
+    AstraDBIndexerConfig,
+    AstraDBUploader,
+    AstraDBUploaderConfig,
+    AstraDBUploadStager,
+    DestinationConnectionError,
+    SourceConnectionError,
+)
+# Collection used by test_precheck_succeeds; presumably it must already exist in the
+# target Astra DB instance -- TODO confirm against the test environment setup.
+EXISTENT_COLLECTION_NAME = "ingest_test_src"
+# Collection name used by test_precheck_fails, which expects prechecks against it to raise.
+NONEXISTENT_COLLECTION_NAME = "nonexistant"
+@pytest.fixture
+def connection_config() -> AstraDBConnectionConfig:
+    """Build an Astra DB connection config from the ASTRA_DB_* environment variables.
+    Raises KeyError if either variable is missing from the environment.
+    """
+    credentials = AstraDBAccessConfig(
+        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+        api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+    )
+    return AstraDBConnectionConfig(access_config=credentials)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, DESTINATION_TAG)
+@requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
+def test_precheck_succeeds(connection_config: AstraDBConnectionConfig):
+    """Indexer and uploader prechecks both pass for EXISTENT_COLLECTION_NAME."""
+    indexer_cfg = AstraDBIndexerConfig(collection_name=EXISTENT_COLLECTION_NAME)
+    indexer = AstraDBIndexer(connection_config=connection_config, index_config=indexer_cfg)
+    uploader_cfg = AstraDBUploaderConfig(collection_name=EXISTENT_COLLECTION_NAME)
+    uploader = AstraDBUploader(connection_config=connection_config, upload_config=uploader_cfg)
+    # Any exception raised by a precheck fails the test.
+    for connector in (indexer, uploader):
+        connector.precheck()
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, DESTINATION_TAG)
+@requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
+def test_precheck_fails(connection_config: AstraDBConnectionConfig):
+    """Prechecks against NONEXISTENT_COLLECTION_NAME raise the connector-specific errors."""
+    indexer_cfg = AstraDBIndexerConfig(collection_name=NONEXISTENT_COLLECTION_NAME)
+    indexer = AstraDBIndexer(connection_config=connection_config, index_config=indexer_cfg)
+    uploader_cfg = AstraDBUploaderConfig(collection_name=NONEXISTENT_COLLECTION_NAME)
+    uploader = AstraDBUploader(connection_config=connection_config, upload_config=uploader_cfg)
+    # The source side (indexer) and destination side (uploader) raise different error types.
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
+@dataclass(frozen=True)
+class EnvData:
+    """Immutable pair of Astra DB connection settings, populated by get_env_data()."""
+    # Astra DB API endpoint URL.
+    api_endpoint: str
+    # Astra DB application token.
+    token: str
+def get_env_data() -> EnvData:
+    """Read the Astra DB endpoint and token from the environment.
+    Raises KeyError if either variable is not set.
+    """
+    endpoint = os.environ["ASTRA_DB_API_ENDPOINT"]
+    app_token = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
+    return EnvData(api_endpoint=endpoint, token=app_token)
+@pytest.fixture
+def collection(upload_file: Path) -> Collection:
+    """Create a uniquely named Astra DB collection, yield it, and drop it on teardown.
+    The collection's vector dimension is the length of the first element's embeddings
+    in ``upload_file``.
+    """
+    collection_name = f"utic_test_{str(uuid4())[:8]}"
+    with upload_file.open("r") as upload_fp:
+        elements = json.load(upload_fp)
+    embedding_dimension = len(elements[0]["embeddings"])
+    client = AstraDBClient()
+    env_data = get_env_data()
+    astra_db = client.get_database(
+        api_endpoint=env_data.api_endpoint,
+        token=env_data.token,
+    )
+    created = astra_db.create_collection(collection_name, dimension=embedding_dimension)
+    try:
+        yield created
+    finally:
+        # Runs even when the test body fails, so the collection is always dropped.
+        astra_db.drop_collection(created)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+async def test_azure_ai_search_destination(
+    upload_file: Path,
+    collection: Collection,
+    tmp_path: Path,
+):
+    """Stage and upload elements to Astra DB twice, checking the document count each time.
+    NOTE(review): the name mentions Azure AI Search but this test exercises the Astra DB
+    uploader -- looks like a copy-paste; consider renaming.
+    """
+    def assert_collection_count(expected_count: int) -> None:
+        # upper_bound is twice the expected count, as in both original checks.
+        current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
+        assert current_count == expected_count, (
+            f"Expected count ({expected_count}) doesn't match how "
+            f"much came back from collection: {current_count}"
+        )
+    source_identifiers = SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name)
+    file_data = FileData(
+        source_identifiers=source_identifiers,
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    stager = AstraDBUploadStager()
+    env_data = get_env_data()
+    access_config = AstraDBAccessConfig(api_endpoint=env_data.api_endpoint, token=env_data.token)
+    uploader = AstraDBUploader(
+        connection_config=AstraDBConnectionConfig(access_config=access_config),
+        upload_config=AstraDBUploaderConfig(collection_name=collection.name),
+    )
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    uploader.precheck()
+    uploader.run(path=staged_filepath, file_data=file_data)
+    # Run validation: the collection should hold one document per staged element.
+    with staged_filepath.open() as staged_fp:
+        expected_total = len(json.load(staged_fp))
+    assert_collection_count(expected_total)
+    # Rerun and make sure the same documents get updated
+    uploader.run(path=staged_filepath, file_data=file_data)
+    assert_collection_count(expected_total)

test/integration/connectors/test_azure_cog_search.py ADDED Viewed

@@ -0,0 +1,233 @@
+import json
+import os
+import time
+from pathlib import Path
+from uuid import uuid4
+import pytest
+from azure.core.credentials import AzureKeyCredential
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
+    ComplexField,
+    CorsOptions,
+    HnswAlgorithmConfiguration,
+    HnswParameters,
+    SearchField,
+    SearchFieldDataType,
+    SearchIndex,
+    SimpleField,
+    VectorSearch,
+    VectorSearchAlgorithmMetric,
+    VectorSearchProfile,
+)
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
+    CONNECTOR_TYPE,
+    RECORD_ID_LABEL,
+    AzureAISearchAccessConfig,
+    AzureAISearchConnectionConfig,
+    AzureAISearchUploader,
+    AzureAISearchUploaderConfig,
+    AzureAISearchUploadStager,
+    AzureAISearchUploadStagerConfig,
+)
+# Resolved directory containing this test module.
+repo_path = Path(__file__).parent.resolve()
+# Name of the environment variable holding the API key (not the key itself).
+API_KEY = "AZURE_SEARCH_API_KEY"
+# Search service endpoint used by both the index client and the uploader under test.
+ENDPOINT = "https://ingest-test-azure-cognitive-search.search.windows.net"
+def get_api_key() -> str:
+    """Return the API key stored in the environment variable named by API_KEY.
+    Raises KeyError if that variable is not set.
+    """
+    return os.environ[API_KEY]
+def get_fields() -> list:
+    """Return the top-level field definitions for the test search index.
+    The schema is built bottom-up: the "data_source" and "coordinates" sub-fields are
+    nested inside the "metadata" complex field, which sits next to the id, text, type
+    and embeddings fields.
+    """
+    # Sub-fields of the "data_source" complex field nested under "metadata".
+    data_source_fields = [
+        SimpleField(name="url", type=SearchFieldDataType.String),
+        SimpleField(name="version", type=SearchFieldDataType.String),
+        SimpleField(name="date_created", type=SearchFieldDataType.DateTimeOffset),
+        SimpleField(name="date_modified", type=SearchFieldDataType.DateTimeOffset),
+        SimpleField(name="date_processed", type=SearchFieldDataType.DateTimeOffset),
+        SimpleField(name="permissions_data", type=SearchFieldDataType.String),
+        SimpleField(name="record_locator", type=SearchFieldDataType.String),
+    ]
+    # Sub-fields of the "coordinates" complex field nested under "metadata".
+    coordinates_fields = [
+        SimpleField(name="system", type=SearchFieldDataType.String),
+        SimpleField(name="layout_width", type=SearchFieldDataType.Double),
+        SimpleField(name="layout_height", type=SearchFieldDataType.Double),
+        SimpleField(name="points", type=SearchFieldDataType.String),
+    ]
+    # Sub-fields of the top-level "metadata" complex field.
+    metadata_fields = [
+        SimpleField(name="orig_elements", type=SearchFieldDataType.String),
+        SimpleField(name="category_depth", type=SearchFieldDataType.Int32),
+        SimpleField(name="parent_id", type=SearchFieldDataType.String),
+        SimpleField(name="attached_to_filename", type=SearchFieldDataType.String),
+        SimpleField(name="filetype", type=SearchFieldDataType.String),
+        SimpleField(name="last_modified", type=SearchFieldDataType.DateTimeOffset),
+        SimpleField(name="is_continuation", type=SearchFieldDataType.Boolean),
+        SimpleField(name="file_directory", type=SearchFieldDataType.String),
+        SimpleField(name="filename", type=SearchFieldDataType.String),
+        ComplexField(name="data_source", fields=data_source_fields),
+        ComplexField(name="coordinates", fields=coordinates_fields),
+        SimpleField(
+            name="languages", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+        ),
+        SimpleField(name="page_number", type=SearchFieldDataType.String),
+        SimpleField(name="links", type=SearchFieldDataType.Collection(SearchFieldDataType.String)),
+        SimpleField(name="page_name", type=SearchFieldDataType.String),
+        SimpleField(name="url", type=SearchFieldDataType.String),
+        SimpleField(
+            name="link_urls", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+        ),
+        SimpleField(
+            name="link_texts", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+        ),
+        SimpleField(
+            name="sent_from", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+        ),
+        SimpleField(
+            name="sent_to", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+        ),
+        SimpleField(name="subject", type=SearchFieldDataType.String),
+        SimpleField(name="section", type=SearchFieldDataType.String),
+        SimpleField(name="header_footer_type", type=SearchFieldDataType.String),
+        SimpleField(
+            name="emphasized_text_contents",
+            type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+        ),
+        SimpleField(
+            name="emphasized_text_tags",
+            type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+        ),
+        SimpleField(name="text_as_html", type=SearchFieldDataType.String),
+        SimpleField(name="regex_metadata", type=SearchFieldDataType.String),
+        SimpleField(name="detection_class_prob", type=SearchFieldDataType.Double),
+    ]
+    # Top-level fields: "id" is the document key and RECORD_ID_LABEL is filterable.
+    fields = [
+        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+        SimpleField(name=RECORD_ID_LABEL, type=SearchFieldDataType.String, filterable=True),
+        SimpleField(name="element_id", type=SearchFieldDataType.String),
+        SimpleField(name="text", type=SearchFieldDataType.String),
+        SimpleField(name="type", type=SearchFieldDataType.String),
+        ComplexField(name="metadata", fields=metadata_fields),
+        # Vector field with 384 dimensions; the profile name is the same string that
+        # get_vector_search() gives its VectorSearchProfile.
+        SearchField(
+            name="embeddings",
+            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+            vector_search_dimensions=384,
+            vector_search_profile_name="embeddings-config-profile",
+        ),
+    ]
+    return fields
+def get_vector_search() -> VectorSearch:
+    """Return vector search settings: one cosine HNSW algorithm and one profile using it."""
+    hnsw_algorithm = HnswAlgorithmConfiguration(
+        name="hnsw-config",
+        parameters=HnswParameters(
+            metric=VectorSearchAlgorithmMetric.COSINE,
+        ),
+    )
+    # The profile refers to the algorithm above by its name.
+    embeddings_profile = VectorSearchProfile(
+        name="embeddings-config-profile", algorithm_configuration_name="hnsw-config"
+    )
+    return VectorSearch(algorithms=[hnsw_algorithm], profiles=[embeddings_profile])
+def get_search_index_client() -> SearchIndexClient:
+    """Create a SearchIndexClient for ENDPOINT, authenticated with the API key."""
+    credential = AzureKeyCredential(get_api_key())
+    return SearchIndexClient(ENDPOINT, credential)
+@pytest.fixture
+def index() -> str:
+    """Create a uniquely named search index, yield its name, and delete it on teardown."""
+    index_name = f"utic-test-{str(uuid4())[:8]}"
+    client = get_search_index_client()
+    search_index = SearchIndex(
+        name=index_name,
+        fields=get_fields(),
+        vector_search=get_vector_search(),
+        cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
+    )
+    print(f"creating index: {index_name}")
+    client.create_index(index=search_index)
+    try:
+        yield index_name
+    finally:
+        # Runs even when the test body fails, so the index is always deleted.
+        print(f"deleting index: {index_name}")
+        client.delete_index(search_index)
+def validate_count(
+    search_client: SearchClient, expected_count: int, retries: int = 10, interval: int = 1
+) -> None:
+    """Assert that the index document count equals ``expected_count``, polling if needed.
+    The count is read once immediately; while it does not match, it is re-read up to
+    ``retries`` more times with ``interval`` seconds of sleep before each re-read.
+    Args:
+        search_client: Client whose ``get_document_count()`` reports the current count.
+        expected_count: Number of documents the index is expected to contain.
+        retries: Maximum number of additional polls after the initial read.
+        interval: Seconds to sleep before each additional poll.
+    Raises:
+        AssertionError: If the count still differs once all retries are used up.
+    """
+    index_count = search_client.get_document_count()
+    # range() bounds the polling. The previous while-loop never advanced its counter,
+    # so a count that never matched made the test hang instead of failing.
+    for _ in range(retries):
+        if index_count == expected_count:
+            return
+        time.sleep(interval)
+        index_count = search_client.get_document_count()
+    assert index_count == expected_count, (
+        f"Expected count ({expected_count}) doesn't match how "
+        f"much came back from index: {index_count}"
+    )
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("AZURE_SEARCH_API_KEY")
+async def test_azure_ai_search_destination(
+    upload_file: Path,
+    index: str,
+    tmp_path: Path,
+):
+    """Stage and upload elements twice, validating the index document count each time."""
+    source_identifiers = SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name)
+    file_data = FileData(
+        source_identifiers=source_identifiers,
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    stager = AzureAISearchUploadStager(upload_stager_config=AzureAISearchUploadStagerConfig())
+    connection_config = AzureAISearchConnectionConfig(
+        access_config=AzureAISearchAccessConfig(key=get_api_key()),
+        endpoint=ENDPOINT,
+        index=index,
+    )
+    uploader = AzureAISearchUploader(
+        connection_config=connection_config,
+        upload_config=AzureAISearchUploaderConfig(),
+    )
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    uploader.precheck()
+    uploader.run(path=staged_filepath, file_data=file_data)
+    # Run validation: the index should hold one document per staged element.
+    with staged_filepath.open() as staged_fp:
+        expected_count = len(json.load(staged_fp))
+    search_client: SearchClient = uploader.connection_config.get_search_client()
+    validate_count(search_client=search_client, expected_count=expected_count)
+    # Rerun and make sure the same documents get updated
+    uploader.run(path=staged_filepath, file_data=file_data)
+    validate_count(search_client=search_client, expected_count=expected_count)

test/integration/connectors/test_delta_table.py CHANGED Viewed

@@ -136,3 +136,49 @@ async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
             secret=aws_credentials["AWS_SECRET_ACCESS_KEY"],
         )
         s3fs.rm(path=destination_path, recursive=True)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
+async def test_delta_table_destination_s3_bad_creds(upload_file: Path, temp_dir: Path):
+    """Uploading with invalid AWS credentials must fail with a "403 Forbidden" error."""
+    # Deliberately invalid credentials; only the region is a real value.
+    bad_access_key = "bad key"
+    bad_secret_key = "bad secret"
+    aws_region = "us-east-2"
+    s3_bucket = "s3://utic-platform-test-destination"
+    destination_path = f"{s3_bucket}/destination/test"
+    access_config = DeltaTableAccessConfig(
+        aws_access_key_id=bad_access_key,
+        aws_secret_access_key=bad_secret_key,
+    )
+    connection_config = DeltaTableConnectionConfig(
+        access_config=access_config,
+        aws_region=aws_region,
+        table_uri=destination_path,
+    )
+    stager = DeltaTableUploadStager(upload_stager_config=DeltaTableUploadStagerConfig())
+    new_upload_file = stager.run(
+        elements_filepath=upload_file,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+    uploader = DeltaTableUploader(
+        connection_config=connection_config, upload_config=DeltaTableUploaderConfig()
+    )
+    source_identifiers = SourceIdentifiers(
+        fullpath=upload_file.name, filename=new_upload_file.name
+    )
+    file_data = FileData(
+        source_identifiers=source_identifiers,
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    # Either entry point is expected to raise; the message is checked after the block.
+    with pytest.raises(Exception) as excinfo:
+        if uploader.is_async():
+            await uploader.run_async(path=new_upload_file, file_data=file_data)
+        else:
+            uploader.run(path=new_upload_file, file_data=file_data)
+    assert "403 Forbidden" in str(excinfo.value), f"Exception message did not match: {str(excinfo)}"

unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl