unstructured-ingest 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (35)
  1. test/integration/connectors/test_confluence.py +113 -0
  2. test/integration/connectors/test_kafka.py +67 -0
  3. test/integration/connectors/test_onedrive.py +112 -0
  4. test/integration/connectors/test_qdrant.py +137 -0
  5. test/integration/connectors/utils/docker.py +2 -1
  6. test/integration/connectors/utils/validation.py +73 -22
  7. unstructured_ingest/__version__.py +1 -1
  8. unstructured_ingest/connector/kafka.py +0 -1
  9. unstructured_ingest/interfaces.py +7 -7
  10. unstructured_ingest/v2/processes/chunker.py +2 -2
  11. unstructured_ingest/v2/processes/connectors/__init__.py +12 -1
  12. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  13. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  14. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +1 -10
  15. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  16. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
  17. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
  18. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
  19. unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
  20. unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
  21. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  22. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  23. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  24. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  25. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  26. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  27. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  28. unstructured_ingest/v2/processes/partitioner.py +14 -3
  29. unstructured_ingest/v2/unstructured_api.py +24 -10
  30. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +22 -22
  31. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +35 -20
  32. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
  33. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
  34. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
  35. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/test/integration/connectors/test_confluence.py
@@ -0,0 +1,113 @@
+import os
+
+import pytest
+
+from test.integration.connectors.utils.constants import (
+    SOURCE_TAG,
+)
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.processes.connectors.confluence import (
+    CONNECTOR_TYPE,
+    ConfluenceAccessConfig,
+    ConfluenceConnectionConfig,
+    ConfluenceDownloader,
+    ConfluenceDownloaderConfig,
+    ConfluenceIndexer,
+    ConfluenceIndexerConfig,
+)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+async def test_confluence_source(temp_dir):
+    # Retrieve environment variables
+    confluence_url = "https://unstructured-ingest-test.atlassian.net"
+    user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+    api_token = os.environ["CONFLUENCE_API_TOKEN"]
+    spaces = ["testteamsp", "MFS"]
+
+    # Create connection and indexer configurations
+    access_config = ConfluenceAccessConfig(api_token=api_token)
+    connection_config = ConfluenceConnectionConfig(
+        url=confluence_url,
+        user_email=user_email,
+        access_config=access_config,
+    )
+    index_config = ConfluenceIndexerConfig(
+        max_num_of_spaces=500,
+        max_num_of_docs_from_each_space=100,
+        spaces=spaces,
+    )
+
+    download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+    # Instantiate indexer and downloader
+    indexer = ConfluenceIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = ConfluenceDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id="confluence",
+            expected_num_files=11,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+async def test_confluence_source_large(temp_dir):
+    # Retrieve environment variables
+    confluence_url = "https://unstructured-ingest-test.atlassian.net"
+    user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+    api_token = os.environ["CONFLUENCE_API_TOKEN"]
+    spaces = ["testteamsp1"]
+
+    # Create connection and indexer configurations
+    access_config = ConfluenceAccessConfig(api_token=api_token)
+    connection_config = ConfluenceConnectionConfig(
+        url=confluence_url,
+        user_email=user_email,
+        access_config=access_config,
+    )
+    index_config = ConfluenceIndexerConfig(
+        max_num_of_spaces=10,
+        max_num_of_docs_from_each_space=250,
+        spaces=spaces,
+    )
+
+    download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+    # Instantiate indexer and downloader
+    indexer = ConfluenceIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = ConfluenceDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id="confluence_large", expected_num_files=250, validate_file_data=False
+        ),
+    )
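The new Confluence classes can also be driven outside the test harness. A minimal sketch, assuming the v2 interfaces behave as the validation helpers use them (indexer.run() yields FileData records that downloader.run() consumes) and that the indexer config's max_num_* limits have workable defaults; the site URL, space key, and download path below are placeholders:

import os
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
    ConfluenceDownloader,
    ConfluenceDownloaderConfig,
    ConfluenceIndexer,
    ConfluenceIndexerConfig,
)

connection_config = ConfluenceConnectionConfig(
    url="https://example.atlassian.net",  # placeholder site
    user_email=os.environ["CONFLUENCE_USER_EMAIL"],
    access_config=ConfluenceAccessConfig(api_token=os.environ["CONFLUENCE_API_TOKEN"]),
)
indexer = ConfluenceIndexer(
    connection_config=connection_config,
    index_config=ConfluenceIndexerConfig(spaces=["DOCS"]),  # placeholder space key
)
downloader = ConfluenceDownloader(
    connection_config=connection_config,
    download_config=ConfluenceDownloaderConfig(download_dir=Path("/tmp/confluence-docs")),
)

# Each indexed page becomes a FileData record the downloader can fetch.
for file_data in indexer.run():
    downloader.run(file_data=file_data)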
--- /dev/null
+++ b/test/integration/connectors/test_kafka.py
@@ -0,0 +1,67 @@
+import socket
+import tempfile
+from pathlib import Path
+
+import pytest
+from confluent_kafka import Producer
+
+from test.integration.connectors.utils.constants import (
+    SOURCE_TAG,
+    env_setup_path,
+)
+from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from unstructured_ingest.v2.processes.connectors.kafka.local import (
+    CONNECTOR_TYPE,
+    LocalKafkaConnectionConfig,
+    LocalKafkaDownloader,
+    LocalKafkaDownloaderConfig,
+    LocalKafkaIndexer,
+    LocalKafkaIndexerConfig,
+)
+
+SEED_MESSAGES = 10
+TOPIC = "fake-topic"
+
+
+@pytest.fixture
+def kafka_seed_topic() -> str:
+    with docker_compose_context(docker_compose_path=env_setup_path / "kafka"):
+        conf = {
+            "bootstrap.servers": "localhost:29092",
+            "client.id": socket.gethostname(),
+            "message.max.bytes": 10485760,
+        }
+        producer = Producer(conf)
+        for i in range(SEED_MESSAGES):
+            message = f"This is some text for message {i}"
+            producer.produce(topic=TOPIC, value=message)
+        producer.flush(timeout=10)
+        print(f"kafka topic {TOPIC} seeded with {SEED_MESSAGES} messages")
+        yield TOPIC
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+async def test_kafka_source_local(kafka_seed_topic: str):
+    connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
+    with tempfile.TemporaryDirectory() as tempdir:
+        tempdir_path = Path(tempdir)
+        download_config = LocalKafkaDownloaderConfig(download_dir=tempdir_path)
+        indexer = LocalKafkaIndexer(
+            connection_config=connection_config,
+            index_config=LocalKafkaIndexerConfig(topic=kafka_seed_topic, num_messages_to_consume=5),
+        )
+        downloader = LocalKafkaDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=ValidationConfigs(
+                test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+            ),
+        )
--- /dev/null
+++ b/test/integration/connectors/test_onedrive.py
@@ -0,0 +1,112 @@
+import os
+import uuid
+from pathlib import Path
+
+import pytest
+from office365.graph_client import GraphClient
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.onedrive import (
+    CONNECTOR_TYPE,
+    OnedriveAccessConfig,
+    OnedriveConnectionConfig,
+    OnedriveUploader,
+    OnedriveUploaderConfig,
+)
+
+
+@pytest.fixture
+def onedrive_test_folder() -> str:
+    """
+    Pytest fixture that creates a test folder in OneDrive and deletes it after the test run.
+    """
+    connection_config = get_connection_config()
+    user_pname = connection_config.user_pname
+
+    # Get the OneDrive client
+    client: GraphClient = connection_config.get_client()
+    drive = client.users[user_pname].drive
+
+    # Generate a unique test folder path
+    test_folder_path = f"utic-test-output-{uuid.uuid4()}"
+
+    # Create the test folder
+    root = drive.root
+    folder = root.create_folder(test_folder_path).execute_query()
+    print(f"created folder: {folder.name}")
+    try:
+        yield test_folder_path
+    finally:
+        # Teardown: delete the test folder and its contents
+        folder.delete_object().execute_query()
+        print(f"successfully deleted folder: {folder.name}")
+
+
+def get_connection_config():
+    """
+    Helper that builds the OnedriveConnectionConfig for tests.
+    """
+    client_id = os.getenv("MS_CLIENT_ID")
+    client_secret = os.getenv("MS_CLIENT_CRED")
+    tenant_id = os.getenv("MS_TENANT_ID")
+    user_pname = os.getenv("MS_USER_PNAME")
+
+    connection_config = OnedriveConnectionConfig(
+        client_id=client_id,
+        tenant=tenant_id,
+        user_pname=user_pname,
+        access_config=OnedriveAccessConfig(client_cred=client_secret),
+    )
+    return connection_config
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
+    """
+    Integration test for the OneDrive destination connector.
+
+    This test uploads a file to OneDrive and verifies that it exists.
+    """
+    connection_config = get_connection_config()
+    # Retrieve the user principal name from the connection config
+    user_pname = connection_config.user_pname
+
+    # The test folder is provided by the fixture
+    destination_folder = onedrive_test_folder
+    destination_fullpath = f"{destination_folder}/{upload_file.name}"
+
+    # Configure the uploader with remote_url
+    upload_config = OnedriveUploaderConfig(remote_url=f"onedrive://{destination_folder}")
+
+    uploader = OnedriveUploader(
+        connection_config=connection_config,
+        upload_config=upload_config,
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(
+            fullpath=destination_fullpath,
+            filename=upload_file.name,
+        ),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock_file_data",
+    )
+    uploader.precheck()
+    uploader.run(path=upload_file, file_data=file_data)
+
+    # Verify that the file was uploaded
+    client = connection_config.get_client()
+    drive = client.users[user_pname].drive
+
+    uploaded_file = (
+        drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
+    )
+
+    # Check that the file exists
+    assert uploaded_file is not None
+    assert uploaded_file.name == upload_file.name
--- /dev/null
+++ b/test/integration/connectors/test_qdrant.py
@@ -0,0 +1,137 @@
+import json
+import uuid
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import AsyncGenerator
+
+import pytest
+from qdrant_client import AsyncQdrantClient
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+    CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+    LocalQdrantConnectionConfig,
+    LocalQdrantUploader,
+    LocalQdrantUploaderConfig,
+    LocalQdrantUploadStager,
+    LocalQdrantUploadStagerConfig,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+    CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+    ServerQdrantConnectionConfig,
+    ServerQdrantUploader,
+    ServerQdrantUploaderConfig,
+    ServerQdrantUploadStager,
+    ServerQdrantUploadStagerConfig,
+)
+
+COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
+VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
+
+
+@asynccontextmanager
+async def qdrant_client(client_params: dict) -> AsyncGenerator[AsyncQdrantClient, None]:
+    client = AsyncQdrantClient(**client_params)
+    try:
+        yield client
+    finally:
+        await client.close()
+
+
+async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+    expected_point_count = len(elements)
+    first_element = elements[0]
+    expected_text = first_element["text"]
+    embeddings = first_element["embeddings"]
+    collection = await client.get_collection(COLLECTION_NAME)
+    assert collection.points_count == expected_point_count
+
+    response = await client.query_points(COLLECTION_NAME, query=embeddings, limit=1)
+    assert response.points[0].payload is not None
+    assert response.points[0].payload["text"] == expected_text
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {"path": str(tmp_path / "qdrant")}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+    stager = LocalQdrantUploadStager(
+        upload_stager_config=LocalQdrantUploadStagerConfig(),
+    )
+    uploader = LocalQdrantUploader(
+        connection_config=LocalQdrantConnectionConfig(**connection_kwargs),
+        upload_config=LocalQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=LOCAL_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
+
+
+@pytest.fixture
+def docker_context():
+    with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
+        yield container
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
+    connection_kwargs = {"location": "http://localhost:6333"}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+    stager = ServerQdrantUploadStager(
+        upload_stager_config=ServerQdrantUploadStagerConfig(),
+    )
+    uploader = ServerQdrantUploader(
+        connection_config=ServerQdrantConnectionConfig(**connection_kwargs),
+        upload_config=ServerQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=SERVER_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
--- a/test/integration/connectors/utils/docker.py
+++ b/test/integration/connectors/utils/docker.py
@@ -47,14 +47,15 @@ def healthcheck_wait(container: Container, timeout: int = 10) -> None:
 
 @contextmanager
 def container_context(
-    docker_client: docker.DockerClient,
     image: str,
     ports: dict,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
     healthcheck: Optional[dict] = None,
     healthcheck_timeout: int = 10,
+    docker_client: Optional[docker.DockerClient] = None,
 ):
+    docker_client = docker_client or docker.from_env()
     container: Optional[Container] = None
     try:
         container = get_container(
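With docker_client now optional and defaulting to docker.from_env(), callers no longer have to construct a Docker client themselves; the qdrant fixture above already uses the shortened form. A minimal sketch of the simplified call:

from test.integration.connectors.utils.docker import container_context

# container_context builds its own client via docker.from_env() when none is passed.
with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
    ...  # exercise the containerized service; teardown happens on exit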
--- a/test/integration/connectors/utils/validation.py
+++ b/test/integration/connectors/utils/validation.py
@@ -7,13 +7,14 @@ from pathlib import Path
 from typing import Callable, Optional
 
 import pandas as pd
+from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
 
 from test.integration.connectors.utils.constants import expected_results_path
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
-def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     expected_df = pd.read_csv(expected_filepath)
     current_df = pd.read_csv(current_filepath)
     if expected_df.equals(current_df):
@@ -27,6 +28,42 @@ def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) ->
     return False
 
 
+def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_soup = BeautifulSoup(expected_f, "html.parser")
+    with current_filepath.open() as current_f:
+        current_soup = BeautifulSoup(current_f, "html.parser")
+    return expected_soup.text == current_soup.text
+
+
+def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_text_lines = expected_f.readlines()
+    with current_filepath.open() as current_f:
+        current_text_lines = current_f.readlines()
+    if len(expected_text_lines) != len(current_text_lines):
+        print(
+            f"Lines in expected text file ({len(expected_text_lines)}) "
+            f"don't match current text file ({len(current_text_lines)})"
+        )
+        return False
+    expected_text = "\n".join(expected_text_lines)
+    current_text = "\n".join(current_text_lines)
+    if expected_text == current_text:
+        return True
+    print("txt contents don't match:")
+    print(f"expected: {expected_text}")
+    print(f"current: {current_text}")
+    return False
+
+
+file_type_equality_check = {
+    ".json": json_equality_check,
+    ".html": html_equality_check,
+    ".txt": txt_equality_check,
+}
+
+
 @dataclass
 class ValidationConfigs:
     test_id: str
@@ -39,6 +76,7 @@ class ValidationConfigs:
     )
     exclude_fields_extend: list[str] = field(default_factory=list)
     validate_downloaded_files: bool = False
+    validate_file_data: bool = True
     downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
@@ -86,7 +124,7 @@
 
 def get_files(dir_path: Path) -> list[str]:
     return [
-        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.iterdir() if f.is_file()
+        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.rglob("*") if f.is_file()
     ]
 
 
@@ -122,6 +160,23 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
+def detect_diff(
+    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
+) -> bool:
+    if expected_filepath.suffix != current_filepath.suffix:
+        return True
+    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+        return not downloaded_file_equality_check(expected_filepath, current_filepath)
+    current_suffix = expected_filepath.suffix
+    if current_suffix in file_type_equality_check:
+        equality_check_callable = file_type_equality_check[current_suffix]
+        return not equality_check_callable(
+            expected_filepath=expected_filepath, current_filepath=current_filepath
+        )
+    # Fallback: compare the files with filecmp.cmp
+    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
@@ -133,15 +188,7 @@
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-            is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
-        elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
-            is_different = not pandas_df_equality_check(
-                expected_filepath=expected_file_path, current_filepath=current_file_path
-            )
-        else:
-            is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
-        if is_different:
+        if detect_diff(configs, expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -185,17 +232,19 @@ def update_fixtures(
     download_dir: Path,
     all_file_data: list[FileData],
     save_downloads: bool = False,
+    save_filedata: bool = True,
 ):
     # Delete current files
     shutil.rmtree(path=output_dir, ignore_errors=True)
     output_dir.mkdir(parents=True)
     # Rewrite the current file data
-    file_data_output_path = output_dir / "file_data"
-    file_data_output_path.mkdir(parents=True, exist_ok=True)
-    for file_data in all_file_data:
-        file_data_path = file_data_output_path / f"{file_data.identifier}.json"
-        with file_data_path.open(mode="w") as f:
-            json.dump(file_data.to_dict(), f, indent=2)
+    if save_filedata:
+        file_data_output_path = output_dir / "file_data"
+        file_data_output_path.mkdir(parents=True, exist_ok=True)
+        for file_data in all_file_data:
+            file_data_path = file_data_output_path / f"{file_data.identifier}.json"
+            with file_data_path.open(mode="w") as f:
+                json.dump(file_data.to_dict(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -229,11 +278,12 @@ def run_all_validations(
         predownload_file_data=pre_data, postdownload_file_data=post_data
     )
     configs.run_download_dir_validation(download_dir=download_dir)
-    run_expected_results_validation(
-        expected_output_dir=test_output_dir / "file_data",
-        all_file_data=postdownload_file_data,
-        configs=configs,
-    )
+    if configs.validate_file_data:
+        run_expected_results_validation(
+            expected_output_dir=test_output_dir / "file_data",
+            all_file_data=postdownload_file_data,
+            configs=configs,
+        )
     download_files = get_files(dir_path=download_dir)
     download_files.sort()
     run_directory_structure_validation(
@@ -291,4 +341,5 @@ async def source_connector_validation(
         download_dir=download_dir,
         all_file_data=all_postdownload_file_data,
         save_downloads=configs.validate_downloaded_files,
+        save_filedata=configs.validate_file_data,
    )
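A user-supplied downloaded_file_equality_check still takes precedence over the new per-suffix registry, since detect_diff consults it first. A minimal sketch of wiring one in (the size-based check and test id are hypothetical; note that the callable returns True when the files are considered equal):

from pathlib import Path

from test.integration.connectors.utils.validation import ValidationConfigs


def size_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    # Hypothetical check: treat files as equal if their byte sizes match.
    return expected_filepath.stat().st_size == current_filepath.stat().st_size


configs = ValidationConfigs(
    test_id="my-connector",  # hypothetical test id
    expected_num_files=5,
    validate_downloaded_files=True,
    downloaded_file_equality_check=size_equality_check,
)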
--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.1"  # pragma: no cover
+__version__ = "0.2.2"  # pragma: no cover
--- a/unstructured_ingest/connector/kafka.py
+++ b/unstructured_ingest/connector/kafka.py
@@ -181,7 +181,6 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
                 logger.debug(f"found {len(collected)} messages, stopping")
                 consumer.commit(asynchronous=False)
                 break
-
         return [
             KafkaIngestDoc(
                 connector_config=self.connector_config,
--- a/unstructured_ingest/interfaces.py
+++ b/unstructured_ingest/interfaces.py
@@ -21,6 +21,7 @@ from unstructured_ingest.enhanced_dataclass.core import _asdict
 from unstructured_ingest.error import PartitionError, SourceConnectionError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.v2.unstructured_api import call_api
 
 if TYPE_CHECKING:
     from unstructured.documents.elements import Element
@@ -565,6 +566,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
     ) -> list["Element"]:
         from unstructured.documents.elements import DataSourceMetadata
         from unstructured.partition.auto import partition
+        from unstructured.staging.base import elements_from_dicts
 
         if not partition_config.partition_by_api:
             logger.debug("Using local partition")
@@ -582,18 +584,16 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
                 **partition_kwargs,
             )
         else:
-            from unstructured.partition.api import partition_via_api
-
             endpoint = partition_config.partition_endpoint
 
             logger.debug(f"using remote partition ({endpoint})")
-
-            elements = partition_via_api(
-                filename=str(self.filename),
+            elements_dicts = call_api(
+                server_url=endpoint,
                 api_key=partition_config.api_key,
-                api_url=endpoint,
-                **partition_kwargs,
+                filename=Path(self.filename),
+                api_parameters=partition_kwargs,
             )
+            elements = elements_from_dicts(elements_dicts)
         # TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
         # pass the stringified json here
         return elements
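The rewritten branch routes remote partitioning through the shared v2 call_api helper, which returns plain element dicts that elements_from_dicts then rehydrates into Element objects. A standalone sketch of the equivalent call, with the keyword names taken from the diff above and the endpoint, key, and parameters as placeholder values:

from pathlib import Path

from unstructured.staging.base import elements_from_dicts
from unstructured_ingest.v2.unstructured_api import call_api

element_dicts = call_api(
    server_url="https://api.unstructured.io",  # placeholder endpoint
    api_key="YOUR_API_KEY",  # placeholder key
    filename=Path("example.pdf"),  # placeholder input file
    api_parameters={"strategy": "fast"},  # assumed pass-through partition kwargs
)
elements = elements_from_dicts(element_dicts)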
--- a/unstructured_ingest/v2/processes/chunker.py
+++ b/unstructured_ingest/v2/processes/chunker.py
@@ -9,7 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.unstructured_api import call_api
+from unstructured_ingest.v2.unstructured_api import call_api_async
 
 CHUNK_MAX_CHARS_DEFAULT: int = 500
 CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -112,7 +112,7 @@ class Chunker(BaseProcess, ABC):
 
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-        elements = await call_api(
+        elements = await call_api_async(
             server_url=self.config.chunking_endpoint,
             api_key=self.config.chunk_api_key.get_secret_value(),
             filename=elements_filepath,