unstructured-ingest 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release.

Files changed (64)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +102 -91
  10. test/integration/connectors/sql/test_singlestore.py +111 -99
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +86 -75
  13. test/integration/connectors/test_astradb.py +22 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +4 -4
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +3 -3
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  35. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  36. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  37. unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
  38. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  39. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  40. unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
  41. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  42. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  43. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  44. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  45. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
  46. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  47. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  48. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  49. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  50. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  51. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  52. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  53. unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
  54. unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
  55. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -65
  56. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  57. unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
  58. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  59. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +21 -17
  60. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
  61. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
  62. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
  63. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
  64. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_neo4j.py
@@ -0,0 +1,236 @@
+ import json
+ import time
+ import uuid
+ from datetime import datetime
+ from pathlib import Path
+
+ import pytest
+ from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
+ from neo4j.exceptions import ServiceUnavailable
+ from pytest_check import check
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
+ from test.integration.connectors.utils.docker import container_context
+ from unstructured_ingest.error import DestinationConnectionError
+ from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
+ from unstructured_ingest.v2.interfaces.file_data import (
+     FileData,
+     FileDataSourceMetadata,
+     SourceIdentifiers,
+ )
+ from unstructured_ingest.v2.processes.connectors.neo4j import (
+     CONNECTOR_TYPE,
+     Label,
+     Neo4jAccessConfig,
+     Neo4jConnectionConfig,
+     Neo4jUploader,
+     Neo4jUploaderConfig,
+     Neo4jUploadStager,
+     Relationship,
+ )
+
+ USERNAME = "neo4j"
+ PASSWORD = "password"
+ URI = "neo4j://localhost:7687"
+ DATABASE = "neo4j"
+
+ EXPECTED_DOCUMENT_COUNT = 1
+
+
+ # NOTE: Precheck tests are read-only so we utilize the same container for all tests.
+ # If new tests require clean neo4j container, this fixture's scope should be adjusted.
+ @pytest.fixture(autouse=True, scope="module")
+ def _neo4j_server():
+     with container_context(
+         image="neo4j:latest", environment={"NEO4J_AUTH": "neo4j/password"}, ports={"7687": "7687"}
+     ):
+         driver = GraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
+         wait_for_connection(driver)
+         driver.close()
+         yield
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
+ async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
+     stager = Neo4jUploadStager()
+     uploader = Neo4jUploader(
+         connection_config=Neo4jConnectionConfig(
+             access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
+             username=USERNAME,
+             uri=URI,
+             database=DATABASE,
+         ),
+         upload_config=Neo4jUploaderConfig(),
+     )
+     file_data = FileData(
+         identifier="mock-file-data",
+         connector_type="neo4j",
+         source_identifiers=SourceIdentifiers(
+             filename=upload_file.name,
+             fullpath=upload_file.name,
+         ),
+         metadata=FileDataSourceMetadata(
+             date_created=str(datetime(2022, 1, 1).timestamp()),
+             date_modified=str(datetime(2022, 1, 2).timestamp()),
+         ),
+     )
+     staged_filepath = stager.run(
+         upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+
+     await uploader.run_async(staged_filepath, file_data)
+     await validate_uploaded_graph(upload_file)
+
+     modified_upload_file = tmp_path / f"modified-{upload_file.name}"
+     with open(upload_file) as file:
+         elements = json.load(file)
+         for element in elements:
+             element["element_id"] = str(uuid.uuid4())
+
+     with open(modified_upload_file, "w") as file:
+         json.dump(elements, file, indent=4)
+
+     staged_filepath = stager.run(
+         modified_upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=modified_upload_file.name,
+     )
+     await uploader.run_async(staged_filepath, file_data)
+     await validate_uploaded_graph(modified_upload_file)
+
+
+ @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
+ class TestPrecheck:
+     @pytest.fixture
+     def configured_uploader(self) -> Neo4jUploader:
+         return Neo4jUploader(
+             connection_config=Neo4jConnectionConfig(
+                 access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
+                 username=USERNAME,
+                 uri=URI,
+                 database=DATABASE,
+             ),
+             upload_config=Neo4jUploaderConfig(),
+         )
+
+     def test_succeeds(self, configured_uploader: Neo4jUploader):
+         configured_uploader.precheck()
+
+     def test_fails_on_invalid_password(self, configured_uploader: Neo4jUploader):
+         configured_uploader.connection_config.access_config.get_secret_value().password = (
+             "invalid-password"
+         )
+         with pytest.raises(
+             DestinationConnectionError,
+             match="{code: Neo.ClientError.Security.Unauthorized}",
+         ):
+             configured_uploader.precheck()
+
+     def test_fails_on_invalid_username(self, configured_uploader: Neo4jUploader):
+         configured_uploader.connection_config.username = "invalid-username"
+         with pytest.raises(
+             DestinationConnectionError, match="{code: Neo.ClientError.Security.Unauthorized}"
+         ):
+             configured_uploader.precheck()
+
+     @pytest.mark.parametrize(
+         ("uri", "expected_error_msg"),
+         [
+             ("neo4j://localhst:7687", "Cannot resolve address"),
+             ("neo4j://localhost:7777", "Unable to retrieve routing information"),
+         ],
+     )
+     def test_fails_on_invalid_uri(
+         self, configured_uploader: Neo4jUploader, uri: str, expected_error_msg: str
+     ):
+         configured_uploader.connection_config.uri = uri
+         with pytest.raises(DestinationConnectionError, match=expected_error_msg):
+             configured_uploader.precheck()
+
+     def test_fails_on_invalid_database(self, configured_uploader: Neo4jUploader):
+         configured_uploader.connection_config.database = "invalid-database"
+         with pytest.raises(
+             DestinationConnectionError, match="{code: Neo.ClientError.Database.DatabaseNotFound}"
+         ):
+             configured_uploader.precheck()
+
+
+ def wait_for_connection(driver: Driver, retries: int = 10, delay_seconds: int = 2):
+     attempts = 0
+     while attempts < retries:
+         try:
+             driver.verify_connectivity()
+             return
+         except ServiceUnavailable:
+             time.sleep(delay_seconds)
+             attempts += 1
+
+     pytest.fail("Failed to connect with Neo4j server.")
+
+
+ async def validate_uploaded_graph(upload_file: Path):
+     with open(upload_file) as file:
+         elements = json.load(file)
+
+     for element in elements:
+         if "orig_elements" in element["metadata"]:
+             element["metadata"]["orig_elements"] = elements_from_base64_gzipped_json(
+                 element["metadata"]["orig_elements"]
+             )
+         else:
+             element["metadata"]["orig_elements"] = []
+
+     expected_chunks_count = len(elements)
+     expected_element_count = len(
+         {
+             origin_element["element_id"]
+             for chunk in elements
+             for origin_element in chunk["metadata"]["orig_elements"]
+         }
+     )
+     expected_nodes_count = expected_chunks_count + expected_element_count + EXPECTED_DOCUMENT_COUNT
+
+     driver = AsyncGraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
+     try:
+         nodes_count = len((await driver.execute_query("MATCH (n) RETURN n"))[0])
+         chunk_nodes_count = len(
+             (await driver.execute_query(f"MATCH (n: {Label.CHUNK}) RETURN n"))[0]
+         )
+         document_nodes_count = len(
+             (await driver.execute_query(f"MATCH (n: {Label.DOCUMENT}) RETURN n"))[0]
+         )
+         element_nodes_count = len(
+             (await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT}) RETURN n"))[0]
+         )
+         with check:
+             assert nodes_count == expected_nodes_count
+         with check:
+             assert document_nodes_count == EXPECTED_DOCUMENT_COUNT
+         with check:
+             assert chunk_nodes_count == expected_chunks_count
+         with check:
+             assert element_nodes_count == expected_element_count
+
+         records, _, _ = await driver.execute_query(
+             f"MATCH ()-[r:{Relationship.PART_OF_DOCUMENT}]->(:{Label.DOCUMENT}) RETURN r"
+         )
+         part_of_document_count = len(records)
+
+         records, _, _ = await driver.execute_query(
+             f"MATCH (:{Label.CHUNK})-[r:{Relationship.NEXT_CHUNK}]->(:{Label.CHUNK}) RETURN r"
+         )
+         next_chunk_count = len(records)
+
+         if not check.any_failures():
+             with check:
+                 assert part_of_document_count == expected_chunks_count + expected_element_count
+             with check:
+                 assert next_chunk_count == expected_chunks_count - 1
+
+     finally:
+         await driver.close()
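The test above exercises the new Neo4j destination end to end: stage, upload, re-upload with fresh element ids, then validate node and relationship counts. As a minimal sketch of that same flow outside pytest — assuming only the names the test itself imports, plus a hypothetical local elements.json of partitioned elements — direct use of the connector might look like this:

# Minimal sketch, not the canonical pipeline entry point. Assumes the classes
# exported by unstructured_ingest.v2.processes.connectors.neo4j in 0.3.9 and a
# hypothetical "elements.json" input file.
import asyncio
from pathlib import Path

from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
from unstructured_ingest.v2.processes.connectors.neo4j import (
    Neo4jAccessConfig,
    Neo4jConnectionConfig,
    Neo4jUploader,
    Neo4jUploaderConfig,
    Neo4jUploadStager,
)


async def main() -> None:
    elements_file = Path("elements.json")  # hypothetical input
    stager = Neo4jUploadStager()
    uploader = Neo4jUploader(
        connection_config=Neo4jConnectionConfig(
            access_config=Neo4jAccessConfig(password="password"),
            username="neo4j",
            uri="neo4j://localhost:7687",
            database="neo4j",
        ),
        upload_config=Neo4jUploaderConfig(),
    )
    file_data = FileData(
        identifier="example",
        connector_type="neo4j",
        source_identifiers=SourceIdentifiers(
            filename=elements_file.name, fullpath=elements_file.name
        ),
    )
    # Fail fast on bad credentials, URI, or database, as TestPrecheck does.
    uploader.precheck()
    staged = stager.run(
        elements_file,
        file_data=file_data,
        output_dir=Path("."),
        output_filename=elements_file.name,
    )
    await uploader.run_async(staged, file_data)


if __name__ == "__main__":
    asyncio.run(main())

Calling precheck() first mirrors the reason the TestPrecheck suite exists: connection problems surface as DestinationConnectionError before any data is staged or uploaded.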
test/integration/connectors/test_pinecone.py
@@ -8,12 +8,17 @@ from typing import Generator
  from uuid import uuid4

  import pytest
+ from _pytest.fixtures import TopRequest
  from pinecone import Pinecone, ServerlessSpec
  from pinecone.core.openapi.shared.exceptions import NotFoundException

  from test.integration.connectors.utils.constants import (
      DESTINATION_TAG,
  )
+ from test.integration.connectors.utils.validation.destination import (
+     StagerValidationConfigs,
+     stager_validation,
+ )
  from test.integration.utils import requires_env
  from unstructured_ingest.error import DestinationConnectionError
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
@@ -251,7 +256,10 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
          identifier="mock-file-data",
      )
      staged_file = stager.run(
-         file_data, large_metadata_upload_file, tmp_path, large_metadata_upload_file.name
+         elements_filepath=large_metadata_upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=large_metadata_upload_file.name,
      )
      try:
          uploader.run(staged_file, file_data)
@@ -262,3 +270,19 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
          raise pytest.fail("Upload request failed due to metadata exceeding limits.")

      validate_pinecone_index(pinecone_index, 1, interval=5)
+
+
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+ def test_pinecone_stager(
+     request: TopRequest,
+     upload_file_str: str,
+     tmp_path: Path,
+ ):
+     upload_file: Path = request.getfixturevalue(upload_file_str)
+     stager = PineconeUploadStager()
+     stager_validation(
+         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+         input_file=upload_file,
+         stager=stager,
+         tmp_dir=tmp_path,
+     )
test/integration/connectors/test_qdrant.py
@@ -6,10 +6,15 @@ from pathlib import Path
  from typing import AsyncGenerator

  import pytest
+ from _pytest.fixtures import TopRequest
  from qdrant_client import AsyncQdrantClient

  from test.integration.connectors.utils.constants import DESTINATION_TAG
  from test.integration.connectors.utils.docker import container_context
+ from test.integration.connectors.utils.validation.destination import (
+     StagerValidationConfigs,
+     stager_validation,
+ )
  from test.integration.utils import requires_env
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
  from unstructured_ingest.v2.processes.connectors.qdrant.cloud import (
@@ -138,7 +143,7 @@ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, dock
          output_dir=tmp_path,
          output_filename=upload_file.name,
      )
-
+     uploader.precheck()
      if uploader.is_async():
          await uploader.run_async(path=staged_upload_file, file_data=file_data)
      else:
@@ -183,10 +188,28 @@ async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
          output_dir=tmp_path,
          output_filename=upload_file.name,
      )
-
+     uploader.precheck()
      if uploader.is_async():
          await uploader.run_async(path=staged_upload_file, file_data=file_data)
      else:
          uploader.run(path=staged_upload_file, file_data=file_data)
      async with qdrant_client(connection_kwargs) as client:
          await validate_upload(client=client, upload_file=upload_file)
+
+
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+ def test_qdrant_stager(
+     request: TopRequest,
+     upload_file_str: str,
+     tmp_path: Path,
+ ):
+     upload_file: Path = request.getfixturevalue(upload_file_str)
+     stager = LocalQdrantUploadStager(
+         upload_stager_config=LocalQdrantUploadStagerConfig(),
+     )
+     stager_validation(
+         configs=StagerValidationConfigs(test_id=LOCAL_CONNECTOR_TYPE, expected_count=22),
+         input_file=upload_file,
+         stager=stager,
+         tmp_dir=tmp_path,
+     )
test/integration/connectors/test_s3.py
@@ -11,8 +11,8 @@ from test.integration.connectors.utils.constants import (
      env_setup_path,
  )
  from test.integration.connectors.utils.docker_compose import docker_compose_context
- from test.integration.connectors.utils.validation import (
-     ValidationConfigs,
+ from test.integration.connectors.utils.validation.source import (
+     SourceValidationConfigs,
      source_connector_validation,
  )
  from test.integration.utils import requires_env
@@ -62,7 +62,7 @@ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
      await source_connector_validation(
          indexer=indexer,
          downloader=downloader,
-         configs=ValidationConfigs(
+         configs=SourceValidationConfigs(
              test_id="s3",
              predownload_file_data_check=validate_predownload_file_data,
              postdownload_file_data_check=validate_postdownload_file_data,
@@ -85,7 +85,7 @@ async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig
      await source_connector_validation(
          indexer=indexer,
          downloader=downloader,
-         configs=ValidationConfigs(
+         configs=SourceValidationConfigs(
              test_id="s3-specialchar",
              predownload_file_data_check=validate_predownload_file_data,
              postdownload_file_data_check=validate_postdownload_file_data,
@@ -121,7 +121,7 @@ async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
      await source_connector_validation(
          indexer=indexer,
          downloader=downloader,
-         configs=ValidationConfigs(
+         configs=SourceValidationConfigs(
              test_id="s3-minio",
              predownload_file_data_check=validate_predownload_file_data,
              postdownload_file_data_check=validate_postdownload_file_data,
@@ -165,11 +165,14 @@ async def test_s3_destination(upload_file: Path):
          identifier="mock file data",
      )
      try:
+         uploader.precheck()
          if uploader.is_async():
              await uploader.run_async(path=upload_file, file_data=file_data)
          else:
              uploader.run(path=upload_file, file_data=file_data)
-         uploaded_files = s3fs.ls(path=destination_path)
+         uploaded_files = [
+             Path(file) for file in s3fs.ls(path=destination_path) if Path(file).name != "_empty"
+         ]
          assert len(uploaded_files) == 1
      finally:
          s3fs.rm(path=destination_path, recursive=True)
test/integration/connectors/utils/docker.py
@@ -44,6 +44,7 @@ def get_container(
      docker_client: docker.DockerClient,
      image: str,
      ports: dict,
+     name: Optional[str] = "connector_test",
      environment: Optional[dict] = None,
      volumes: Optional[dict] = None,
      healthcheck: Optional[HealthCheck] = None,
@@ -59,6 +60,8 @@
          run_kwargs["volumes"] = volumes
      if healthcheck:
          run_kwargs["healthcheck"] = healthcheck.model_dump()
+     if name:
+         run_kwargs["name"] = name
      container: Container = docker_client.containers.run(**run_kwargs)
      return container

@@ -112,6 +115,7 @@
      healthcheck: Optional[HealthCheck] = None,
      healthcheck_retries: int = 30,
      docker_client: Optional[docker.DockerClient] = None,
+     name: Optional[str] = "connector_test",
  ):
      docker_client = docker_client or docker.from_env()
      print(f"pulling image {image}")
@@ -125,6 +129,7 @@
          environment=environment,
          volumes=volumes,
          healthcheck=healthcheck,
+         name=name,
      )
      if healthcheck_data := get_healthcheck(container):
          # Mirror whatever healthcheck config set on container
@@ -143,3 +148,4 @@
      finally:
          if container:
              container.kill()
+             container.remove()
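With these changes, get_container and container_context accept a container name (defaulting to "connector_test"), and the context manager now removes the container after killing it, so stale containers from interrupted runs no longer collide with the next session. A short sketch of the updated call, reusing the neo4j fixture's arguments; the overriding name value here is hypothetical:

# Sketch of the updated container_context; omitting name uses the new default
# "connector_test". The container is killed *and* removed on exit.
from test.integration.connectors.utils.docker import container_context

with container_context(
    image="neo4j:latest",
    environment={"NEO4J_AUTH": "neo4j/password"},
    ports={"7687": "7687"},
    name="connector_test_neo4j",  # hypothetical override
):
    ...  # connect to neo4j://localhost:7687 and run assertions

One consequence of the fixed default name: two test modules cannot run their containers concurrently unless at least one of them overrides name.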
test/integration/connectors/utils/validation/destination.py
@@ -0,0 +1,88 @@
+ import json
+ import os
+ import shutil
+ from pathlib import Path
+
+ import ndjson
+
+ from test.integration.connectors.utils.validation.utils import ValidationConfig
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, UploadStager
+
+
+ class StagerValidationConfigs(ValidationConfig):
+     expected_count: int
+
+     def stager_output_dir(self) -> Path:
+         dir = self.test_output_dir() / "stager"
+         dir.mkdir(exist_ok=True, parents=True)
+         return dir
+
+     def stager_output_path(self, input_path: Path) -> Path:
+         return self.stager_output_dir() / input_path.name
+
+
+ def run_all_stager_validations(
+     configs: StagerValidationConfigs, input_file: Path, staged_filepath: Path
+ ):
+     # Validate matching extensions
+     assert input_file.suffix == staged_filepath.suffix
+
+     # Validate length
+     staged_data = get_data(staged_filepath=staged_filepath)
+     assert len(staged_data) == configs.expected_count
+
+     # Validate file
+     expected_filepath = configs.stager_output_path(input_path=input_file)
+     assert expected_filepath.exists(), f"{expected_filepath} does not exist"
+     assert expected_filepath.is_file(), f"{expected_filepath} is not a file"
+     if configs.detect_diff(expected_filepath=expected_filepath, current_filepath=staged_filepath):
+         raise AssertionError(
+             f"Current file ({staged_filepath}) does not match expected file: {expected_filepath}"
+         )
+
+
+ def update_stager_fixtures(stager_output_path: Path, staged_filepath: Path):
+     copied_filepath = stager_output_path / staged_filepath.name
+     shutil.copy(staged_filepath, copied_filepath)
+
+
+ def get_data(staged_filepath: Path) -> list[dict]:
+     if staged_filepath.suffix == ".json":
+         with staged_filepath.open() as f:
+             return json.load(f)
+     elif staged_filepath.suffix == ".ndjson":
+         with staged_filepath.open() as f:
+             return ndjson.load(f)
+     else:
+         raise ValueError(f"Unsupported file type: {staged_filepath.suffix}")
+
+
+ def stager_validation(
+     stager: UploadStager,
+     tmp_dir: Path,
+     input_file: Path,
+     configs: StagerValidationConfigs,
+     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
+ ) -> None:
+     # Run stager
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=input_file.name, filename=input_file.name),
+         connector_type=configs.test_id,
+         identifier="mock file data",
+     )
+     staged_filepath = stager.run(
+         elements_filepath=input_file,
+         file_data=file_data,
+         output_dir=tmp_dir,
+         output_filename=input_file.name,
+     )
+     if not overwrite_fixtures:
+         print("Running validation")
+         run_all_stager_validations(
+             configs=configs, input_file=input_file, staged_filepath=staged_filepath
+         )
+     else:
+         print("Running fixtures update")
+         update_stager_fixtures(
+             stager_output_path=configs.stager_output_dir(), staged_filepath=staged_filepath
+         )
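stager_validation stages a file and either diffs the result against a checked-in fixture or, when OVERWRITE_FIXTURES=true, refreshes that fixture in place. Note that the overwrite_fixtures default is evaluated when the module is imported, so the environment variable must be set before pytest starts. A sketch of a direct call mirroring the pinecone test above; the input path is hypothetical:

from pathlib import Path

from test.integration.connectors.utils.validation.destination import (
    StagerValidationConfigs,
    stager_validation,
)
from unstructured_ingest.v2.processes.connectors.pinecone import PineconeUploadStager

# expected_count=22 mirrors the shared upload fixtures used by the pinecone and
# qdrant stager tests; the input file path below is hypothetical.
stager_validation(
    configs=StagerValidationConfigs(test_id="pinecone", expected_count=22),
    input_file=Path("test/assets/upload.ndjson"),
    stager=PineconeUploadStager(),
    tmp_dir=Path("/tmp/stager-out"),
)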
test/integration/connectors/utils/validation/equality.py
@@ -0,0 +1,75 @@
+ import json
+ from pathlib import Path
+
+ import ndjson
+ from bs4 import BeautifulSoup
+ from deepdiff import DeepDiff
+
+
+ def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+     with expected_filepath.open() as f:
+         expected_data = json.load(f)
+     with current_filepath.open() as f:
+         current_data = json.load(f)
+     diff = DeepDiff(expected_data, current_data)
+     if diff:
+         print("diff between expected and current json")
+         print(diff.to_json(indent=2))
+         return False
+     return True
+
+
+ def ndjson_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+     with expected_filepath.open() as f:
+         expected_data = ndjson.load(f)
+     with current_filepath.open() as f:
+         current_data = ndjson.load(f)
+     if len(current_data) != len(expected_data):
+         print(
+             f"expected data length {len(expected_data)} "
+             f"didn't match current results: {len(current_data)}"
+         )
+     for i in range(len(expected_data)):
+         e = expected_data[i]
+         r = current_data[i]
+         if e != r:
+             print(f"{i}th element doesn't match:\nexpected {e}\ncurrent {r}")
+             return False
+     return True
+
+
+ def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+     with expected_filepath.open() as expected_f:
+         expected_soup = BeautifulSoup(expected_f, "html.parser")
+     with current_filepath.open() as current_f:
+         current_soup = BeautifulSoup(current_f, "html.parser")
+     return expected_soup.text == current_soup.text
+
+
+ def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+     with expected_filepath.open() as expected_f:
+         expected_text_lines = expected_f.readlines()
+     with current_filepath.open() as current_f:
+         current_text_lines = current_f.readlines()
+     if len(expected_text_lines) != len(current_text_lines):
+         print(
+             f"Lines in expected text file ({len(expected_text_lines)}) "
+             f"don't match current text file ({len(current_text_lines)})"
+         )
+         return False
+     expected_text = "\n".join(expected_text_lines)
+     current_text = "\n".join(current_text_lines)
+     if expected_text == current_text:
+         return True
+     print("txt content don't match:")
+     print(f"expected: {expected_text}")
+     print(f"current: {current_text}")
+     return False
+
+
+ file_type_equality_check = {
+     ".json": json_equality_check,
+     ".ndjson": ndjson_equality_check,
+     ".html": html_equality_check,
+     ".txt": txt_equality_check,
+ }
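file_type_equality_check maps a file suffix to the comparison routine for that format. The detect_diff hook that StagerValidationConfigs relies on lives in validation/utils.py, which this diff does not expand, so the wiring below is an assumption rather than the released implementation — a sketch of how the dispatch table would be consumed:

from pathlib import Path

from test.integration.connectors.utils.validation.equality import file_type_equality_check


def files_match(expected: Path, current: Path) -> bool:
    # Pick the checker registered for this suffix; suffixes outside the table
    # (.json/.ndjson/.html/.txt) are treated as an error rather than a pass.
    checker = file_type_equality_check.get(expected.suffix)
    if checker is None:
        raise ValueError(f"No equality check registered for {expected.suffix}")
    return checker(expected, current)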