unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (78)
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_confluence.py +113 -0
  4. test/integration/connectors/test_kafka.py +167 -0
  5. test/integration/connectors/test_onedrive.py +112 -0
  6. test/integration/connectors/test_pinecone.py +161 -0
  7. test/integration/connectors/test_qdrant.py +137 -0
  8. test/integration/connectors/test_s3.py +23 -0
  9. test/integration/connectors/utils/docker.py +2 -1
  10. test/integration/connectors/utils/validation.py +73 -22
  11. test/unit/v2/__init__.py +0 -0
  12. test/unit/v2/chunkers/__init__.py +0 -0
  13. test/unit/v2/chunkers/test_chunkers.py +49 -0
  14. test/unit/v2/connectors/__init__.py +0 -0
  15. test/unit/v2/embedders/__init__.py +0 -0
  16. test/unit/v2/embedders/test_bedrock.py +36 -0
  17. test/unit/v2/embedders/test_huggingface.py +48 -0
  18. test/unit/v2/embedders/test_mixedbread.py +37 -0
  19. test/unit/v2/embedders/test_octoai.py +35 -0
  20. test/unit/v2/embedders/test_openai.py +35 -0
  21. test/unit/v2/embedders/test_togetherai.py +37 -0
  22. test/unit/v2/embedders/test_vertexai.py +37 -0
  23. test/unit/v2/embedders/test_voyageai.py +38 -0
  24. test/unit/v2/partitioners/__init__.py +0 -0
  25. test/unit/v2/partitioners/test_partitioner.py +63 -0
  26. test/unit/v2/utils/__init__.py +0 -0
  27. test/unit/v2/utils/data_generator.py +32 -0
  28. unstructured_ingest/__version__.py +1 -1
  29. unstructured_ingest/cli/cmds/__init__.py +2 -2
  30. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  31. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  32. unstructured_ingest/connector/kafka.py +0 -1
  33. unstructured_ingest/interfaces.py +7 -7
  34. unstructured_ingest/runner/writers/__init__.py +2 -2
  35. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  36. unstructured_ingest/v2/constants.py +2 -0
  37. unstructured_ingest/v2/processes/chunker.py +2 -2
  38. unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
  39. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  40. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  41. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  42. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  43. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  44. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  45. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  46. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  47. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
  48. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  49. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  50. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  51. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
  52. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
  53. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  54. unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
  55. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  56. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  57. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  58. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  59. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  60. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  61. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  62. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  63. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  64. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  65. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  66. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  67. unstructured_ingest/v2/processes/partitioner.py +14 -3
  68. unstructured_ingest/v2/unstructured_api.py +24 -10
  69. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
  70. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
  71. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  72. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  73. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  74. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  75. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  76. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  77. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  78. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
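
The headline change in this release is the rename of the Azure destination from azure_cognitive_search to azure_ai_search across the CLI, the v1 connector, the runner writers, and the v2 connectors (items 30, 31, 35, 41, and 71 above). As a minimal migration sketch for v2 users: the 0.3.0 import path and class names below are taken verbatim from the new integration test further down; the 0.2.x class names are not visible in this diff, so only the old module path is shown.

# 0.2.x module path (from the rename above); its class names are not shown
# in this diff, so none are guessed here:
# from unstructured_ingest.v2.processes.connectors.azure_cognitive_search import ...

# 0.3.0 path and class names, as used by the new integration test:
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
    AzureAISearchUploader,
    AzureAISearchUploaderConfig,
    AzureAISearchUploadStager,
    AzureAISearchUploadStagerConfig,
)
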
test/integration/connectors/test_astradb.py
@@ -0,0 +1,109 @@
+ import json
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+ from uuid import uuid4
+
+ import pytest
+ from astrapy import Collection
+ from astrapy import DataAPIClient as AstraDBClient
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.astradb import (
+     CONNECTOR_TYPE,
+     AstraDBAccessConfig,
+     AstraDBConnectionConfig,
+     AstraDBUploader,
+     AstraDBUploaderConfig,
+     AstraDBUploadStager,
+ )
+
+
+ @dataclass(frozen=True)
+ class EnvData:
+     api_endpoint: str
+     token: str
+
+
+ def get_env_data() -> EnvData:
+     return EnvData(
+         api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+         token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+     )
+
+
+ @pytest.fixture
+ def collection(upload_file: Path) -> Collection:
+     random_id = str(uuid4())[:8]
+     collection_name = f"utic_test_{random_id}"
+     with upload_file.open("r") as upload_fp:
+         upload_data = json.load(upload_fp)
+     first_content = upload_data[0]
+     embeddings = first_content["embeddings"]
+     embedding_dimension = len(embeddings)
+     my_client = AstraDBClient()
+     env_data = get_env_data()
+     astra_db = my_client.get_database(
+         api_endpoint=env_data.api_endpoint,
+         token=env_data.token,
+     )
+     collection = astra_db.create_collection(collection_name, dimension=embedding_dimension)
+     try:
+         yield collection
+     finally:
+         astra_db.drop_collection(collection)
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+ async def test_astradb_destination(
+     upload_file: Path,
+     collection: Collection,
+     tmp_path: Path,
+ ):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     stager = AstraDBUploadStager()
+     env_data = get_env_data()
+     uploader = AstraDBUploader(
+         connection_config=AstraDBConnectionConfig(
+             access_config=AstraDBAccessConfig(
+                 api_endpoint=env_data.api_endpoint, token=env_data.token
+             ),
+         ),
+         upload_config=AstraDBUploaderConfig(collection_name=collection.name),
+     )
+     staged_filepath = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+     uploader.precheck()
+     uploader.run(path=staged_filepath, file_data=file_data)
+
+     # Run validation
+     with staged_filepath.open() as f:
+         staged_elements = json.load(f)
+     expected_count = len(staged_elements)
+     current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
+     assert current_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"many came back from collection: {current_count}"
+     )
+
+     # Rerun and make sure the same documents get updated
+     uploader.run(path=staged_filepath, file_data=file_data)
+     current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
+     assert current_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"many came back from collection: {current_count}"
+     )
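
Distilled from the test above, a minimal sketch of the v2 AstraDB destination flow outside pytest. All class names appear in the test; the elements file and collection name are placeholders.

import os
from pathlib import Path

from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
from unstructured_ingest.v2.processes.connectors.astradb import (
    CONNECTOR_TYPE,
    AstraDBAccessConfig,
    AstraDBConnectionConfig,
    AstraDBUploader,
    AstraDBUploaderConfig,
    AstraDBUploadStager,
)

elements_file = Path("elements.json")  # placeholder: partitioned elements with embeddings
file_data = FileData(
    source_identifiers=SourceIdentifiers(fullpath=elements_file.name, filename=elements_file.name),
    connector_type=CONNECTOR_TYPE,
    identifier="example-doc",
)
uploader = AstraDBUploader(
    connection_config=AstraDBConnectionConfig(
        access_config=AstraDBAccessConfig(
            api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
            token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        ),
    ),
    # Placeholder collection; the test creates one sized to the embedding dimension.
    upload_config=AstraDBUploaderConfig(collection_name="my_collection"),
)
# Stage the raw elements into the uploader's expected shape, then upload.
staged = AstraDBUploadStager().run(
    elements_filepath=elements_file,
    file_data=file_data,
    output_dir=elements_file.parent,
    output_filename=elements_file.name,
)
uploader.precheck()
uploader.run(path=staged, file_data=file_data)

The rerun assertion in the test implies uploads are idempotent per record, so running this twice should not duplicate documents.
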
test/integration/connectors/test_azure_cog_search.py
@@ -0,0 +1,233 @@
+ import json
+ import os
+ import time
+ from pathlib import Path
+ from uuid import uuid4
+
+ import pytest
+ from azure.core.credentials import AzureKeyCredential
+ from azure.search.documents import SearchClient
+ from azure.search.documents.indexes import SearchIndexClient
+ from azure.search.documents.indexes.models import (
+     ComplexField,
+     CorsOptions,
+     HnswAlgorithmConfiguration,
+     HnswParameters,
+     SearchField,
+     SearchFieldDataType,
+     SearchIndex,
+     SimpleField,
+     VectorSearch,
+     VectorSearchAlgorithmMetric,
+     VectorSearchProfile,
+ )
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
+     CONNECTOR_TYPE,
+     RECORD_ID_LABEL,
+     AzureAISearchAccessConfig,
+     AzureAISearchConnectionConfig,
+     AzureAISearchUploader,
+     AzureAISearchUploaderConfig,
+     AzureAISearchUploadStager,
+     AzureAISearchUploadStagerConfig,
+ )
+
+ repo_path = Path(__file__).parent.resolve()
+
+ API_KEY = "AZURE_SEARCH_API_KEY"
+ ENDPOINT = "https://ingest-test-azure-cognitive-search.search.windows.net"
+
+
+ def get_api_key() -> str:
+     key = os.environ[API_KEY]
+     return key
+
+
+ def get_fields() -> list:
+     data_source_fields = [
+         SimpleField(name="url", type=SearchFieldDataType.String),
+         SimpleField(name="version", type=SearchFieldDataType.String),
+         SimpleField(name="date_created", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="date_modified", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="date_processed", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="permissions_data", type=SearchFieldDataType.String),
+         SimpleField(name="record_locator", type=SearchFieldDataType.String),
+     ]
+     coordinates_fields = [
+         SimpleField(name="system", type=SearchFieldDataType.String),
+         SimpleField(name="layout_width", type=SearchFieldDataType.Double),
+         SimpleField(name="layout_height", type=SearchFieldDataType.Double),
+         SimpleField(name="points", type=SearchFieldDataType.String),
+     ]
+     metadata_fields = [
+         SimpleField(name="orig_elements", type=SearchFieldDataType.String),
+         SimpleField(name="category_depth", type=SearchFieldDataType.Int32),
+         SimpleField(name="parent_id", type=SearchFieldDataType.String),
+         SimpleField(name="attached_to_filename", type=SearchFieldDataType.String),
+         SimpleField(name="filetype", type=SearchFieldDataType.String),
+         SimpleField(name="last_modified", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="is_continuation", type=SearchFieldDataType.Boolean),
+         SimpleField(name="file_directory", type=SearchFieldDataType.String),
+         SimpleField(name="filename", type=SearchFieldDataType.String),
+         ComplexField(name="data_source", fields=data_source_fields),
+         ComplexField(name="coordinates", fields=coordinates_fields),
+         SimpleField(
+             name="languages", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(name="page_number", type=SearchFieldDataType.String),
+         SimpleField(name="links", type=SearchFieldDataType.Collection(SearchFieldDataType.String)),
+         SimpleField(name="page_name", type=SearchFieldDataType.String),
+         SimpleField(name="url", type=SearchFieldDataType.String),
+         SimpleField(
+             name="link_urls", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="link_texts", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="sent_from", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="sent_to", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(name="subject", type=SearchFieldDataType.String),
+         SimpleField(name="section", type=SearchFieldDataType.String),
+         SimpleField(name="header_footer_type", type=SearchFieldDataType.String),
+         SimpleField(
+             name="emphasized_text_contents",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+         ),
+         SimpleField(
+             name="emphasized_text_tags",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+         ),
+         SimpleField(name="text_as_html", type=SearchFieldDataType.String),
+         SimpleField(name="regex_metadata", type=SearchFieldDataType.String),
+         SimpleField(name="detection_class_prob", type=SearchFieldDataType.Double),
+     ]
+     fields = [
+         SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+         SimpleField(name=RECORD_ID_LABEL, type=SearchFieldDataType.String, filterable=True),
+         SimpleField(name="element_id", type=SearchFieldDataType.String),
+         SimpleField(name="text", type=SearchFieldDataType.String),
+         SimpleField(name="type", type=SearchFieldDataType.String),
+         ComplexField(name="metadata", fields=metadata_fields),
+         SearchField(
+             name="embeddings",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+             vector_search_dimensions=384,
+             vector_search_profile_name="embeddings-config-profile",
+         ),
+     ]
+     return fields
+
+
+ def get_vector_search() -> VectorSearch:
+     return VectorSearch(
+         algorithms=[
+             HnswAlgorithmConfiguration(
+                 name="hnsw-config",
+                 parameters=HnswParameters(
+                     metric=VectorSearchAlgorithmMetric.COSINE,
+                 ),
+             )
+         ],
+         profiles=[
+             VectorSearchProfile(
+                 name="embeddings-config-profile", algorithm_configuration_name="hnsw-config"
+             )
+         ],
+     )
+
+
+ def get_search_index_client() -> SearchIndexClient:
+     api_key = get_api_key()
+     return SearchIndexClient(ENDPOINT, AzureKeyCredential(api_key))
+
+
+ @pytest.fixture
+ def index() -> str:
+     random_id = str(uuid4())[:8]
+     index_name = f"utic-test-{random_id}"
+     client = get_search_index_client()
+     index = SearchIndex(
+         name=index_name,
+         fields=get_fields(),
+         vector_search=get_vector_search(),
+         cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
+     )
+     print(f"creating index: {index_name}")
+     client.create_index(index=index)
+     try:
+         yield index_name
+     finally:
+         print(f"deleting index: {index_name}")
+         client.delete_index(index)
+
+
+ def validate_count(
+     search_client: SearchClient, expected_count: int, retries: int = 10, interval: int = 1
+ ) -> None:
+     index_count = search_client.get_document_count()
+     if index_count == expected_count:
+         return
+     tries = 0
+     while tries < retries:
+         tries += 1
+         time.sleep(interval)
+         index_count = search_client.get_document_count()
+         if index_count == expected_count:
+             break
+     assert index_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"many came back from index: {index_count}"
+     )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("AZURE_SEARCH_API_KEY")
+ async def test_azure_ai_search_destination(
+     upload_file: Path,
+     index: str,
+     tmp_path: Path,
+ ):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     stager = AzureAISearchUploadStager(upload_stager_config=AzureAISearchUploadStagerConfig())
+
+     uploader = AzureAISearchUploader(
+         connection_config=AzureAISearchConnectionConfig(
+             access_config=AzureAISearchAccessConfig(key=get_api_key()),
+             endpoint=ENDPOINT,
+             index=index,
+         ),
+         upload_config=AzureAISearchUploaderConfig(),
+     )
+     staged_filepath = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+     uploader.precheck()
+     uploader.run(path=staged_filepath, file_data=file_data)
+
+     # Run validation
+     with staged_filepath.open() as f:
+         staged_elements = json.load(f)
+     expected_count = len(staged_elements)
+     search_client: SearchClient = uploader.connection_config.get_search_client()
+     validate_count(search_client=search_client, expected_count=expected_count)
+
+     # Rerun and make sure the same documents get updated
+     uploader.run(path=staged_filepath, file_data=file_data)
+     validate_count(search_client=search_client, expected_count=expected_count)
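
The test above only asserts document counts. For a content spot-check, a hedged sketch using the standard azure-search-documents client directly; the field names come from the index schema in the test, and the index name is a placeholder.

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Placeholder index name; reuse ENDPOINT and get_api_key() from the test above.
client = SearchClient(
    endpoint=ENDPOINT,
    index_name="utic-test-example",
    credential=AzureKeyCredential(get_api_key()),
)
# search_text="*" matches all documents; peek at a few uploaded records.
for doc in client.search(search_text="*", top=3):
    print(doc["id"], doc.get("type"), (doc.get("text") or "")[:80])
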
test/integration/connectors/test_confluence.py
@@ -0,0 +1,113 @@
+ import os
+
+ import pytest
+
+ from test.integration.connectors.utils.constants import (
+     SOURCE_TAG,
+ )
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.processes.connectors.confluence import (
+     CONNECTOR_TYPE,
+     ConfluenceAccessConfig,
+     ConfluenceConnectionConfig,
+     ConfluenceDownloader,
+     ConfluenceDownloaderConfig,
+     ConfluenceIndexer,
+     ConfluenceIndexerConfig,
+ )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+ async def test_confluence_source(temp_dir):
+     # Retrieve environment variables
+     confluence_url = "https://unstructured-ingest-test.atlassian.net"
+     user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+     api_token = os.environ["CONFLUENCE_API_TOKEN"]
+     spaces = ["testteamsp", "MFS"]
+
+     # Create connection and indexer configurations
+     access_config = ConfluenceAccessConfig(api_token=api_token)
+     connection_config = ConfluenceConnectionConfig(
+         url=confluence_url,
+         user_email=user_email,
+         access_config=access_config,
+     )
+     index_config = ConfluenceIndexerConfig(
+         max_num_of_spaces=500,
+         max_num_of_docs_from_each_space=100,
+         spaces=spaces,
+     )
+
+     download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+     # Instantiate indexer and downloader
+     indexer = ConfluenceIndexer(
+         connection_config=connection_config,
+         index_config=index_config,
+     )
+     downloader = ConfluenceDownloader(
+         connection_config=connection_config,
+         download_config=download_config,
+     )
+
+     # Run the source connector validation
+     await source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=ValidationConfigs(
+             test_id="confluence",
+             expected_num_files=11,
+             validate_downloaded_files=True,
+         ),
+     )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+ async def test_confluence_source_large(temp_dir):
+     # Retrieve environment variables
+     confluence_url = "https://unstructured-ingest-test.atlassian.net"
+     user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+     api_token = os.environ["CONFLUENCE_API_TOKEN"]
+     spaces = ["testteamsp1"]
+
+     # Create connection and indexer configurations
+     access_config = ConfluenceAccessConfig(api_token=api_token)
+     connection_config = ConfluenceConnectionConfig(
+         url=confluence_url,
+         user_email=user_email,
+         access_config=access_config,
+     )
+     index_config = ConfluenceIndexerConfig(
+         max_num_of_spaces=10,
+         max_num_of_docs_from_each_space=250,
+         spaces=spaces,
+     )
+
+     download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+     # Instantiate indexer and downloader
+     indexer = ConfluenceIndexer(
+         connection_config=connection_config,
+         index_config=index_config,
+     )
+     downloader = ConfluenceDownloader(
+         connection_config=connection_config,
+         download_config=download_config,
+     )
+
+     # Run the source connector validation
+     await source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=ValidationConfigs(
+             test_id="confluence_large", expected_num_files=250, validate_file_data=False
+         ),
+     )
test/integration/connectors/test_kafka.py
@@ -0,0 +1,167 @@
+ import json
+ import tempfile
+ from pathlib import Path
+
+ import pytest
+ from confluent_kafka import Consumer, KafkaError, KafkaException, Producer
+ from confluent_kafka.admin import AdminClient, NewTopic
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+     SOURCE_TAG,
+     env_setup_path,
+ )
+ from test.integration.connectors.utils.docker_compose import docker_compose_context
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.kafka.local import (
+     CONNECTOR_TYPE,
+     LocalKafkaConnectionConfig,
+     LocalKafkaDownloader,
+     LocalKafkaDownloaderConfig,
+     LocalKafkaIndexer,
+     LocalKafkaIndexerConfig,
+     LocalKafkaUploader,
+     LocalKafkaUploaderConfig,
+ )
+
+ SEED_MESSAGES = 10
+ TOPIC = "fake-topic"
+
+
+ @pytest.fixture
+ def docker_compose_ctx():
+     with docker_compose_context(docker_compose_path=env_setup_path / "kafka") as ctx:
+         yield ctx
+
+
+ @pytest.fixture
+ def kafka_seed_topic(docker_compose_ctx) -> str:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+     }
+     producer = Producer(conf)
+     for i in range(SEED_MESSAGES):
+         message = f"This is some text for message {i}"
+         producer.produce(topic=TOPIC, value=message)
+     producer.flush(timeout=10)
+     print(f"kafka topic {TOPIC} seeded with {SEED_MESSAGES} messages")
+     return TOPIC
+
+
+ @pytest.fixture
+ def kafka_upload_topic(docker_compose_ctx) -> str:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+     }
+     admin_client = AdminClient(conf)
+     admin_client.create_topics([NewTopic(TOPIC, 1, 1)])
+     return TOPIC
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ async def test_kafka_source_local(kafka_seed_topic: str):
+     connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
+     with tempfile.TemporaryDirectory() as tempdir:
+         tempdir_path = Path(tempdir)
+         download_config = LocalKafkaDownloaderConfig(download_dir=tempdir_path)
+         indexer = LocalKafkaIndexer(
+             connection_config=connection_config,
+             index_config=LocalKafkaIndexerConfig(topic=kafka_seed_topic, num_messages_to_consume=5),
+         )
+         downloader = LocalKafkaDownloader(
+             connection_config=connection_config, download_config=download_config
+         )
+         indexer.precheck()
+         await source_connector_validation(
+             indexer=indexer,
+             downloader=downloader,
+             configs=ValidationConfigs(
+                 test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+             ),
+         )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ def test_kafka_source_local_precheck_fail():
+     connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
+     indexer = LocalKafkaIndexer(
+         connection_config=connection_config,
+         index_config=LocalKafkaIndexerConfig(topic=TOPIC, num_messages_to_consume=5),
+     )
+     with pytest.raises(SourceConnectionError):
+         indexer.precheck()
+
+
+ def get_all_messages(topic: str, max_empty_messages: int = 5) -> list[dict]:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+         "group.id": "default_group_id",
+         "enable.auto.commit": "false",
+         "auto.offset.reset": "earliest",
+     }
+     consumer = Consumer(conf)
+     consumer.subscribe([topic])
+     messages = []
+     try:
+         empty_count = 0
+         while empty_count < max_empty_messages:
+             msg = consumer.poll(timeout=1)
+             if msg is None:
+                 empty_count += 1
+                 continue
+             if msg.error():
+                 if msg.error().code() == KafkaError._PARTITION_EOF:
+                     break
+                 else:
+                     raise KafkaException(msg.error())
+             try:
+                 message = json.loads(msg.value().decode("utf8"))
+                 messages.append(message)
+             finally:
+                 consumer.commit(asynchronous=False)
+     finally:
+         print("closing consumer")
+         consumer.close()
+     return messages
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ async def test_kafka_destination_local(upload_file: Path, kafka_upload_topic: str):
+     uploader = LocalKafkaUploader(
+         connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
+         upload_config=LocalKafkaUploaderConfig(topic=TOPIC, batch_size=10),
+     )
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     uploader.precheck()
+     if uploader.is_async():
+         await uploader.run_async(path=upload_file, file_data=file_data)
+     else:
+         uploader.run(path=upload_file, file_data=file_data)
+     all_messages = get_all_messages(topic=kafka_upload_topic)
+     with upload_file.open("r") as upload_fs:
+         content_to_upload = json.load(upload_fs)
+     assert len(all_messages) == len(content_to_upload), (
+         f"expected number of messages ({len(content_to_upload)}) doesn't match how many "
+         f"messages were read off of kafka topic {kafka_upload_topic}: {len(all_messages)}"
+     )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_kafka_destination_local_precheck_fail():
+     uploader = LocalKafkaUploader(
+         connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
+         upload_config=LocalKafkaUploaderConfig(topic=TOPIC, batch_size=10),
+     )
+     with pytest.raises(DestinationConnectionError):
+         uploader.precheck()
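
One pattern worth noting in test_kafka_destination_local is the branch on uploader.is_async() before choosing run_async or run. A small generic helper capturing that dispatch (a sketch; the three method names all appear in the test above):

from pathlib import Path

from unstructured_ingest.v2.interfaces import FileData


async def upload(uploader, path: Path, file_data: FileData) -> None:
    # Prefer the async entry point when the uploader advertises one;
    # otherwise fall back to the synchronous run().
    if uploader.is_async():
        await uploader.run_async(path=path, file_data=file_data)
    else:
        uploader.run(path=path, file_data=file_data)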