PyPI - unstructured-ingest - Versions diffs - 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

unstructured-ingest 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (21) hide show

test/integration/connectors/test_onedrive.py CHANGED Viewed

@@ -5,22 +5,31 @@ from pathlib import Path
 import pytest
 from office365.graph_client import GraphClient
-from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, DESTINATION_TAG
+from test.integration.connectors.utils.constants import (
+    BLOB_STORAGE_TAG,
+    DESTINATION_TAG,
+    SOURCE_TAG,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.onedrive import (
     CONNECTOR_TYPE,
     OnedriveAccessConfig,
     OnedriveConnectionConfig,
+    OnedriveDownloader,
+    OnedriveDownloaderConfig,
+    OnedriveIndexer,
+    OnedriveIndexerConfig,
     OnedriveUploader,
     OnedriveUploaderConfig,
 )
 @pytest.fixture
-@pytest.mark.xfail(
-    reason="Issues with test setup on the provider side."
-)  # TODO: remove line when issues are addressed
 def onedrive_test_folder() -> str:
     """
     Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -65,12 +74,46 @@ def get_connection_config():
     return connection_config
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
+@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+async def test_onedrive_source(temp_dir):
+    connection_config = get_connection_config()
+    index_config = OnedriveIndexerConfig(recursive=True, path="eml")
+    download_config = OnedriveDownloaderConfig(download_dir=temp_dir)
+    # Instantiate indexer and downloader
+    indexer = OnedriveIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = OnedriveDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="onedrive",
+            expected_num_files=1,
+            validate_downloaded_files=True,
+            exclude_fields_extend=[
+                "metadata.date_created",
+                "metadata.date_modified",
+                "additional_metadata.LastModified",
+                "additional_metadata.@microsoft.graph.downloadUrl",
+            ],
+        ),
+    )
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
 @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
-@pytest.mark.xfail(
-    reason="Issues with test setup on the provider side."
-)  # TODO: remove line when issues are addressed
-def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
+def xtest_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     """
     Integration test for the OneDrive destination connector.
@@ -107,10 +150,14 @@ def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     client = connection_config.get_client()
     drive = client.users[user_pname].drive
+    # Workaround: File should not have .json in the metadata.filename it comes from embedder
     uploaded_file = (
-        drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
+        drive.root.get_by_path(f"{destination_fullpath}.json")
+        .select(["id", "name"])
+        .get()
+        .execute_query()
     )
     # Check if the file exists
     assert uploaded_file is not None
-    assert uploaded_file.name == upload_file.name
+    assert uploaded_file.name == f"{upload_file.name}.json"

test/integration/connectors/test_sharepoint.py ADDED Viewed

@@ -0,0 +1,71 @@
+import os
+import pytest
+from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.processes.connectors.sharepoint import (
+    CONNECTOR_TYPE,
+    SharepointAccessConfig,
+    SharepointConnectionConfig,
+    SharepointDownloader,
+    SharepointDownloaderConfig,
+    SharepointIndexer,
+    SharepointIndexerConfig,
+)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
+@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
+async def test_sharepoint_source(temp_dir):
+    # Retrieve environment variables
+    site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
+    client_id = os.environ["SHAREPOINT_CLIENT_ID"]
+    client_cred = os.environ["SHAREPOINT_CRED"]
+    user_pname = os.environ["MS_USER_PNAME"]
+    tenant = os.environ["MS_TENANT_ID"]
+    # Create connection and indexer configurations
+    access_config = SharepointAccessConfig(client_cred=client_cred)
+    connection_config = SharepointConnectionConfig(
+        client_id=client_id,
+        site=site,
+        tenant=tenant,
+        user_pname=user_pname,
+        access_config=access_config,
+    )
+    index_config = SharepointIndexerConfig(recursive=True)
+    download_config = SharepointDownloaderConfig(download_dir=temp_dir)
+    # Instantiate indexer and downloader
+    indexer = SharepointIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = SharepointDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="sharepoint",
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            exclude_fields_extend=[
+                "metadata.date_created",
+                "metadata.date_modified",
+                "additional_metadata.LastModified",
+                "additional_metadata.@microsoft.graph.downloadUrl",
+            ],
+        ),
+    )

test/integration/connectors/utils/validation/source.py CHANGED Viewed

@@ -10,6 +10,13 @@ from pydantic import Field
 from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
+NONSTANDARD_METADATA_FIELDS = {
+    "additional_metadata.@microsoft.graph.downloadUrl": [
+        "additional_metadata",
+        "@microsoft.graph.downloadUrl",
+    ]
+}
 class SourceValidationConfigs(ValidationConfig):
     expected_number_indexed_file_data: Optional[int] = None
@@ -26,7 +33,7 @@ class SourceValidationConfigs(ValidationConfig):
     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
         exclude_fields.extend(self.exclude_fields_extend)
-        return exclude_fields
+        return list(set(exclude_fields))
     def run_file_data_validation(
         self, predownload_file_data: FileData, postdownload_file_data: FileData
@@ -45,8 +52,13 @@ class SourceValidationConfigs(ValidationConfig):
         exclude_fields = self.get_exclude_fields()
         # Ignore fields that dynamically change every time the tests run
         copied_data = data.copy()
         for exclude_field in exclude_fields:
-            exclude_field_vals = exclude_field.split(".")
+            exclude_field_vals = (
+                NONSTANDARD_METADATA_FIELDS[exclude_field]
+                if exclude_field in NONSTANDARD_METADATA_FIELDS
+                else exclude_field.split(".")
+            )
             if len(exclude_field_vals) == 1:
                 current_val = copied_data
                 drop_field = exclude_field_vals[0]
@@ -261,21 +273,38 @@ async def source_connector_validation(
     indexer.precheck()
     download_dir = downloader.download_config.download_dir
     test_output_dir = configs.test_output_dir()
-    for file_data in indexer.run():
-        assert file_data
-        predownload_file_data = file_data.model_copy(deep=True)
-        all_predownload_file_data.append(predownload_file_data)
-        if downloader.is_async():
-            resp = await downloader.run_async(file_data=file_data)
-        else:
-            resp = downloader.run(file_data=file_data)
-        if isinstance(resp, list):
-            for r in resp:
-                postdownload_file_data = r["file_data"].model_copy(deep=True)
+    if indexer.is_async():
+        async for file_data in indexer.run_async():
+            assert file_data
+            predownload_file_data = file_data.model_copy(deep=True)
+            all_predownload_file_data.append(predownload_file_data)
+            if downloader.is_async():
+                resp = await downloader.run_async(file_data=file_data)
+            else:
+                resp = downloader.run(file_data=file_data)
+            if isinstance(resp, list):
+                for r in resp:
+                    postdownload_file_data = r["file_data"].model_copy(deep=True)
+                    all_postdownload_file_data.append(postdownload_file_data)
+            else:
+                postdownload_file_data = resp["file_data"].model_copy(deep=True)
+                all_postdownload_file_data.append(postdownload_file_data)
+    else:
+        for file_data in indexer.run():
+            assert file_data
+            predownload_file_data = file_data.model_copy(deep=True)
+            all_predownload_file_data.append(predownload_file_data)
+            if downloader.is_async():
+                resp = await downloader.run_async(file_data=file_data)
+            else:
+                resp = downloader.run(file_data=file_data)
+            if isinstance(resp, list):
+                for r in resp:
+                    postdownload_file_data = r["file_data"].model_copy(deep=True)
+                    all_postdownload_file_data.append(postdownload_file_data)
+            else:
+                postdownload_file_data = resp["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
-        else:
-            postdownload_file_data = resp["file_data"].model_copy(deep=True)
-            all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         print("Running validation")
         run_all_validations(

test/integration/embedders/test_bedrock.py CHANGED Viewed

@@ -31,7 +31,7 @@ def get_aws_credentials() -> dict:
 def test_bedrock_embedder(embedder_file: Path):
     aws_credentials = get_aws_credentials()
     embedder_config = EmbedderConfig(
-        embedding_provider="aws-bedrock",
+        embedding_provider="bedrock",
         embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
         embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
     )

test/integration/partitioners/test_partitioner.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import json
 import os
 from pathlib import Path
@@ -15,6 +14,9 @@ all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
 non_image_partition_files = [
     path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
 ]
+supported_fast_partition_files = [
+    path for path in non_image_partition_files if path.suffix != ".eml"
+]
 image_partition_files = [
     path for path in all_partition_files if path not in non_image_partition_files
 ]
@@ -33,18 +35,13 @@ async def test_partitioner_api_hi_res(partition_file: Path):
     )
     partitioner = Partitioner(config=partitioner_config)
     results = await partitioner.run_async(filename=partition_file)
-    results_dir = int_test_dir / "results"
-    results_dir.mkdir(exist_ok=True)
-    results_path = results_dir / f"{partition_file.name}.json"
-    with results_path.open("w") as f:
-        json.dump(results, f, indent=2)
     assert results
 @pytest.mark.parametrize(
     "partition_file",
-    non_image_partition_files,
-    ids=[path.name for path in non_image_partition_files],
+    supported_fast_partition_files,
+    ids=[path.name for path in supported_fast_partition_files],
 )
 @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
 @pytest.mark.asyncio
@@ -68,7 +65,11 @@ async def test_partitioner_api_fast_error(partition_file: Path):
     api_key = os.getenv("UNSTRUCTURED_API_KEY")
     api_url = os.getenv("UNSTRUCTURED_API_URL")
     partitioner_config = PartitionerConfig(
-        strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
+        strategy="fast",
+        partition_by_api=True,
+        api_key=api_key,
+        partition_endpoint=api_url,
+        raise_unsupported_filetype=True,
     )
     partitioner = Partitioner(config=partitioner_config)
     with pytest.raises(UserError):

unstructured_ingest/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.4.6" # pragma: no cover
1	+ __version__ = "0.5.0" # pragma: no cover

unstructured_ingest/cli/interfaces.py CHANGED Viewed

@@ -417,7 +417,7 @@ class CliEmbeddingConfig(EmbeddingConfig, CliMixin):
         embed_providers = [
             "openai",
             "huggingface",
-            "aws-bedrock",
+            "bedrock",
             "vertexai",
             "voyageai",
             "octoai",

unstructured_ingest/embed/azure_openai.py CHANGED Viewed

@@ -3,11 +3,15 @@ from typing import TYPE_CHECKING
 from pydantic import Field
-from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.embed.openai import (
+    AsyncOpenAIEmbeddingEncoder,
+    OpenAIEmbeddingConfig,
+    OpenAIEmbeddingEncoder,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 if TYPE_CHECKING:
-    from openai import AzureOpenAI
+    from openai import AsyncAzureOpenAI, AzureOpenAI
 class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
@@ -25,7 +29,22 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
             azure_endpoint=self.azure_endpoint,
         )
+    @requires_dependencies(["openai"], extras="openai")
+    def get_async_client(self) -> "AsyncAzureOpenAI":
+        from openai import AsyncAzureOpenAI
+        return AsyncAzureOpenAI(
+            api_key=self.api_key.get_secret_value(),
+            api_version=self.api_version,
+            azure_endpoint=self.azure_endpoint,
+        )
 @dataclass
 class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
     config: AzureOpenAIEmbeddingConfig
+@dataclass
+class AsyncAzureOpenAIEmbeddingEncoder(AsyncOpenAIEmbeddingEncoder):
+    config: AzureOpenAIEmbeddingConfig

unstructured_ingest/interfaces.py CHANGED Viewed

@@ -226,7 +226,7 @@ class EmbeddingConfig(BaseConfig):
             )
             return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
-        elif self.provider == "aws-bedrock":
+        elif self.provider == "bedrock":
             from unstructured_ingest.embed.bedrock import (
                 BedrockEmbeddingConfig,
                 BedrockEmbeddingEncoder,

unstructured_ingest/v2/pipeline/pipeline.py CHANGED Viewed

@@ -268,6 +268,7 @@ class Pipeline:
         # Partition content
         elements = self.partitioner_step(downloaded_data)
+        elements = self.clean_results(results=elements)
         # Download data non longer needed, delete if possible
         self.downloader_step.delete_cache()
         elements = self.clean_results(results=elements)
@@ -329,9 +330,9 @@ class Pipeline:
         source_entry = {
             k: v
             for k, v in source_registry.items()
-            if isinstance(indexer_config, v.indexer_config)
-            and isinstance(downloader_config, v.downloader_config)
-            and isinstance(source_connection_config, v.connection_config)
+            if type(indexer_config) is v.indexer_config
+            and type(downloader_config) is v.downloader_config
+            and type(source_connection_config) is v.connection_config
         }
         if len(source_entry) > 1:
             raise ValueError(

unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json ADDED Viewed

@@ -0,0 +1,23 @@
+{
+    "properties": [
+        {
+            "dataType": [
+                "text"
+            ],
+            "indexFilterable": true,
+            "indexSearchable": true,
+            "name": "record_id",
+            "tokenization": "word"
+        },
+        {
+            "dataType": [
+                "text"
+            ],
+            "indexFilterable": true,
+            "indexSearchable": true,
+            "name": "text",
+            "tokenization": "word"
+        }
+    ],
+    "vectorizer": "none"
+}

unstructured_ingest/v2/processes/connectors/onedrive.py CHANGED Viewed

@@ -105,6 +105,7 @@ class OnedriveIndexerConfig(IndexerConfig):
 class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
     def precheck(self) -> None:
         try:
@@ -172,7 +173,7 @@ class OnedriveIndexer(Indexer):
         )
         return FileData(
             identifier=drive_item.id,
-            connector_type=CONNECTOR_TYPE,
+            connector_type=self.connector_type,
             source_identifiers=SourceIdentifiers(
                 fullpath=server_path, filename=drive_item.name, rel_path=rel_path
             ),
@@ -201,7 +202,8 @@ class OnedriveIndexer(Indexer):
         token_resp = await asyncio.to_thread(self.connection_config.get_token)
         if "error" in token_resp:
             raise SourceConnectionError(
-                f"[{CONNECTOR_TYPE}]: {token_resp['error']} ({token_resp.get('error_description')})"
+                f"[{self.connector_type}]: {token_resp['error']} "
+                f"({token_resp.get('error_description')})"
             )
         client = await asyncio.to_thread(self.connection_config.get_client)
@@ -221,6 +223,7 @@ class OnedriveDownloaderConfig(DownloaderConfig):
 class OnedriveDownloader(Downloader):
     connection_config: OnedriveConnectionConfig
     download_config: OnedriveDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
     @SourceConnectionNetworkError.wrap
     def _fetch_file(self, file_data: FileData) -> DriveItem:
@@ -260,7 +263,9 @@ class OnedriveDownloader(Downloader):
                     file.download_session(f).execute_query()
             return self.generate_download_response(file_data=file_data, download_path=download_path)
         except Exception as e:
-            logger.error(f"[{CONNECTOR_TYPE}] Exception during downloading: {e}", exc_info=True)
+            logger.error(
+                f"[{self.connector_type}] Exception during downloading: {e}", exc_info=True
+            )
             # Re-raise to see full stack trace locally
             raise

unstructured-ingest 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl