PyPI - unstructured-ingest - Versions diffs - 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (24)

test/integration/connectors/test_sharepoint.py CHANGED Viewed

@@ -19,24 +19,31 @@ from unstructured_ingest.v2.processes.connectors.sharepoint import (
 )
+def sharepoint_config():
+    class SharepointTestConfig:
+        def __init__(self):
+            self.client_id = os.environ["SHAREPOINT_CLIENT_ID"]
+            self.client_cred = os.environ["SHAREPOINT_CRED"]
+            self.user_pname = os.environ["MS_USER_PNAME"]
+            self.tenant = os.environ["MS_TENANT_ID"]
+    return SharepointTestConfig()
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 @requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
 async def test_sharepoint_source(temp_dir):
-    # Retrieve environment variables
     site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
-    client_id = os.environ["SHAREPOINT_CLIENT_ID"]
-    client_cred = os.environ["SHAREPOINT_CRED"]
-    user_pname = os.environ["MS_USER_PNAME"]
-    tenant = os.environ["MS_TENANT_ID"]
+    config = sharepoint_config()
     # Create connection and indexer configurations
-    access_config = SharepointAccessConfig(client_cred=client_cred)
+    access_config = SharepointAccessConfig(client_cred=config.client_cred)
     connection_config = SharepointConnectionConfig(
-        client_id=client_id,
+        client_id=config.client_id,
         site=site,
-        tenant=tenant,
-        user_pname=user_pname,
+        tenant=config.tenant,
+        user_pname=config.user_pname,
         access_config=access_config,
     )
     index_config = SharepointIndexerConfig(recursive=True)
@@ -58,7 +65,151 @@ async def test_sharepoint_source(temp_dir):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="sharepoint",
+            test_id="sharepoint1",
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            exclude_fields_extend=[
+                "metadata.date_created",
+                "metadata.date_modified",
+                "additional_metadata.LastModified",
+                "additional_metadata.@microsoft.graph.downloadUrl",
+            ],
+        ),
+    )
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
+@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
+async def test_sharepoint_source_with_path(temp_dir):
+    site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
+    config = sharepoint_config()
+    # Create connection and indexer configurations
+    access_config = SharepointAccessConfig(client_cred=config.client_cred)
+    connection_config = SharepointConnectionConfig(
+        client_id=config.client_id,
+        site=site,
+        tenant=config.tenant,
+        user_pname=config.user_pname,
+        access_config=access_config,
+    )
+    index_config = SharepointIndexerConfig(recursive=True, path="Folder1")
+    download_config = SharepointDownloaderConfig(download_dir=temp_dir)
+    # Instantiate indexer and downloader
+    indexer = SharepointIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = SharepointDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="sharepoint2",
+            expected_num_files=2,
+            validate_downloaded_files=True,
+            exclude_fields_extend=[
+                "metadata.date_created",
+                "metadata.date_modified",
+                "additional_metadata.LastModified",
+                "additional_metadata.@microsoft.graph.downloadUrl",
+            ],
+        ),
+    )
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
+@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
+async def test_sharepoint_root_with_path(temp_dir):
+    site = "https://unstructuredio.sharepoint.com/"
+    config = sharepoint_config()
+    # Create connection and indexer configurations
+    access_config = SharepointAccessConfig(client_cred=config.client_cred)
+    connection_config = SharepointConnectionConfig(
+        client_id=config.client_id,
+        site=site,
+        tenant=config.tenant,
+        user_pname=config.user_pname,
+        access_config=access_config,
+    )
+    index_config = SharepointIndexerConfig(recursive=True, path="e2e-test-folder")
+    download_config = SharepointDownloaderConfig(download_dir=temp_dir)
+    # Instantiate indexer and downloader
+    indexer = SharepointIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = SharepointDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="sharepoint3",
+            expected_num_files=1,
+            validate_downloaded_files=True,
+            exclude_fields_extend=[
+                "metadata.date_created",
+                "metadata.date_modified",
+                "additional_metadata.LastModified",
+                "additional_metadata.@microsoft.graph.downloadUrl",
+            ],
+        ),
+    )
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
+@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
+async def test_sharepoint_shared_documents(temp_dir):
+    site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
+    config = sharepoint_config()
+    # Create connection and indexer configurations
+    access_config = SharepointAccessConfig(client_cred=config.client_cred)
+    connection_config = SharepointConnectionConfig(
+        client_id=config.client_id,
+        site=site,
+        tenant=config.tenant,
+        user_pname=config.user_pname,
+        access_config=access_config,
+    )
+    index_config = SharepointIndexerConfig(recursive=True, path="Shared Documents")
+    download_config = SharepointDownloaderConfig(download_dir=temp_dir)
+    # Instantiate indexer and downloader
+    indexer = SharepointIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = SharepointDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="sharepoint4",
             expected_num_files=4,
             validate_downloaded_files=True,
             exclude_fields_extend=[

test/unit/v2/embedders/test_bedrock.py CHANGED Viewed

@@ -15,7 +15,7 @@ def generate_embedder_config_params() -> dict:
         "region_name": fake.city(),
     }
     if random.random() < 0.5:
-        params["embed_model_name"] = fake.word()
+        params["embedder_model_name"] = fake.word()
     return params

test/unit/v2/embedders/test_huggingface.py CHANGED Viewed

@@ -16,7 +16,7 @@ fake = faker.Faker()
 def generate_embedder_config_params() -> dict:
     params = {}
     if random.random() < 0.5:
-        params["embed_model_name"] = fake.word() if random.random() < 0.5 else None
+        params["embedder_model_name"] = fake.word() if random.random() < 0.5 else None
         params["embedder_model_kwargs"] = (
             generate_random_dictionary(key_type=str, value_type=Any)
             if random.random() < 0.5

unstructured_ingest/__version__.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "0.5.2" # pragma: no cover
+__version__ = "0.5.4" # pragma: no cover

unstructured_ingest/embed/azure_openai.py CHANGED Viewed

@@ -44,7 +44,13 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
 class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
     config: AzureOpenAIEmbeddingConfig
+    def get_client(self) -> "AzureOpenAI":
+        return self.config.get_client()
 @dataclass
 class AsyncAzureOpenAIEmbeddingEncoder(AsyncOpenAIEmbeddingEncoder):
     config: AzureOpenAIEmbeddingConfig
+    def get_client(self) -> "AsyncAzureOpenAI":
+        return self.config.get_async_client()

unstructured_ingest/embed/bedrock.py CHANGED Viewed

@@ -15,7 +15,13 @@ from unstructured_ingest.embed.interfaces import (
 )
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
+from unstructured_ingest.v2.errors import (
+    ProviderError,
+    RateLimitError,
+    UserAuthError,
+    UserError,
+    is_internal_error,
+)
 if TYPE_CHECKING:
     from botocore.client import BaseClient
@@ -51,9 +57,11 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
     aws_access_key_id: SecretStr
     aws_secret_access_key: SecretStr
     region_name: str = "us-west-2"
-    embed_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
+    embedder_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
     def wrap_error(self, e: Exception) -> Exception:
+        if is_internal_error(e=e):
+            return e
         from botocore.exceptions import ClientError
         if isinstance(e, ClientError):
@@ -122,7 +130,7 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
-        provider = self.config.embed_model_name.split(".")[0]
+        provider = self.config.embedder_model_name.split(".")[0]
         body = conform_query(query=query, provider=provider)
         bedrock_client = self.config.get_client()
@@ -130,7 +138,7 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
         try:
             response = bedrock_client.invoke_model(
                 body=json.dumps(body),
-                modelId=self.config.embed_model_name,
+                modelId=self.config.embedder_model_name,
                 accept="application/json",
                 contentType="application/json",
             )
@@ -148,6 +156,8 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         elements = elements.copy()
         elements_with_text = [e for e in elements if e.get("text")]
+        if not elements_with_text:
+            return elements
         embeddings = [self.embed_query(query=e["text"]) for e in elements_with_text]
         for element, embedding in zip(elements_with_text, embeddings):
             element[EMBEDDINGS_KEY] = embedding
@@ -163,7 +173,7 @@ class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
-        provider = self.config.embed_model_name.split(".")[0]
+        provider = self.config.embedder_model_name.split(".")[0]
         body = conform_query(query=query, provider=provider)
         try:
             async with self.config.get_async_client() as bedrock_client:
@@ -171,7 +181,7 @@ class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
                 try:
                     response = await bedrock_client.invoke_model(
                         body=json.dumps(body),
-                        modelId=self.config.embed_model_name,
+                        modelId=self.config.embedder_model_name,
                         accept="application/json",
                         contentType="application/json",
                     )

unstructured_ingest/embed/huggingface.py CHANGED Viewed

@@ -47,7 +47,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
     config: HuggingFaceEmbeddingConfig
-    def embed_query(self, query: str) -> list[float]:
+    def _embed_query(self, query: str) -> list[float]:
         return self._embed_documents(texts=[query])[0]
     def _embed_documents(self, texts: list[str]) -> list[list[float]]:
@@ -58,6 +58,8 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         elements = elements.copy()
         elements_with_text = [e for e in elements if e.get("text")]
+        if not elements_with_text:
+            return elements
         embeddings = self._embed_documents([e["text"] for e in elements_with_text])
         for element, embedding in zip(elements_with_text, embeddings):
             element[EMBEDDINGS_KEY] = embedding

unstructured_ingest/embed/interfaces.py CHANGED Viewed

@@ -1,11 +1,12 @@
-import asyncio
-from abc import ABC, abstractmethod
+from abc import ABC
 from dataclasses import dataclass
-from typing import Optional
+from typing import Any, Optional
 import numpy as np
 from pydantic import BaseModel, Field
+from unstructured_ingest.utils.data_prep import batch_generator
 EMBEDDINGS_KEY = "embeddings"
@@ -50,21 +51,37 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    @abstractmethod
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        pass
+    def get_client(self):
+        raise NotImplementedError
-    @abstractmethod
-    def embed_query(self, query: str) -> list[float]:
-        pass
+    def embed_batch(self, client: Any, batch: list[str]) -> list[list[float]]:
+        raise NotImplementedError
-    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        results = []
-        for text in elements:
-            response = self.embed_query(query=text)
-            results.append(response)
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        client = self.get_client()
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        texts = [e["text"] for e in elements_with_text]
+        embeddings = []
+        try:
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                embeddings = self.embed_batch(client=client, batch=batch)
+                embeddings.extend(embeddings)
+        except Exception as e:
+            raise self.wrap_error(e=e)
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
+    def _embed_query(self, query: str) -> list[float]:
+        client = self.get_client()
+        return self.embed_batch(client=client, batch=[query])[0]
-        return results
+    def embed_query(self, query: str) -> list[float]:
+        try:
+            return self._embed_query(query=query)
+        except Exception as e:
+            raise self.wrap_error(e=e)
 @dataclass
@@ -88,14 +105,35 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
         exemplary_embedding = await self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-    @abstractmethod
+    def get_client(self):
+        raise NotImplementedError
+    async def embed_batch(self, client: Any, batch: list[str]) -> list[list[float]]:
+        raise NotImplementedError
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        pass
+        client = self.get_client()
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        texts = [e["text"] for e in elements_with_text]
+        embeddings = []
+        try:
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                embeddings = await self.embed_batch(client=client, batch=batch)
+                embeddings.extend(embeddings)
+        except Exception as e:
+            raise self.wrap_error(e=e)
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
+    async def _embed_query(self, query: str) -> list[float]:
+        client = self.get_client()
+        embeddings = await self.embed_batch(client=client, batch=[query])
+        return embeddings[0]
-    @abstractmethod
     async def embed_query(self, query: str) -> list[float]:
-        pass
-    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        results = await asyncio.gather(*[self.embed_query(query=text) for text in elements])
-        return results
+        try:
+            return await self._embed_query(query=query)
+        except Exception as e:
+            raise self.wrap_error(e=e)

unstructured_ingest/embed/mixedbreadai.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import asyncio
 import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
@@ -6,12 +5,10 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import (
-    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
-from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 USER_AGENT = "@mixedbread-ai/unstructured"
@@ -85,7 +82,7 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
     def get_exemplary_embedding(self) -> list[float]:
         """Get an exemplary embedding to determine dimensions and unit vector status."""
-        return self._embed(["Q"])[0]
+        return self.embed_query(query="Q")
     @requires_dependencies(
         ["mixedbread_ai"],
@@ -100,59 +97,19 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
             additional_headers={"User-Agent": USER_AGENT},
         )
-    def _embed(self, texts: list[str]) -> list[list[float]]:
-        """
-        Embed a list of texts using the Mixedbread AI API.
-        Args:
-            texts (list[str]): List of texts to embed.
-        Returns:
-            list[list[float]]: List of embeddings.
-        """
-        responses = []
-        client = self.config.get_client()
-        for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
-            response = client.embeddings(
-                model=self.config.embedder_model_name,
-                normalized=True,
-                encoding_format=ENCODING_FORMAT,
-                truncation_strategy=TRUNCATION_STRATEGY,
-                request_options=self.get_request_options(),
-                input=batch,
-            )
-            responses.append(response)
-        return [item.embedding for response in responses for item in response.data]
-    def embed_documents(self, elements: list[dict]) -> list[dict]:
-        """
-        Embed a list of document elements.
-        Args:
-            elements (list[Element]): List of document elements.
-        Returns:
-            list[Element]: Elements with embeddings.
-        """
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        embeddings = self._embed([e["text"] for e in elements_with_text])
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
-    def embed_query(self, query: str) -> list[float]:
-        """
-        Embed a query string.
-        Args:
-            query (str): Query string to embed.
-        Returns:
-            list[float]: Embedding of the query.
-        """
-        return self._embed([query])[0]
+    def get_client(self) -> "MixedbreadAI":
+        return self.config.get_client()
+    def embed_batch(self, client: "MixedbreadAI", batch: list[str]) -> list[list[float]]:
+        response = client.embeddings(
+            model=self.config.embedder_model_name,
+            normalized=True,
+            encoding_format=ENCODING_FORMAT,
+            truncation_strategy=TRUNCATION_STRATEGY,
+            request_options=self.get_request_options(),
+            input=batch,
+        )
+        return [datum.embedding for datum in response.data]
 @dataclass
@@ -162,8 +119,7 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def get_exemplary_embedding(self) -> list[float]:
         """Get an exemplary embedding to determine dimensions and unit vector status."""
-        embedding = await self._embed(["Q"])
-        return embedding[0]
+        return await self.embed_query(query="Q")
     @requires_dependencies(
         ["mixedbread_ai"],
@@ -178,58 +134,16 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
             additional_headers={"User-Agent": USER_AGENT},
         )
-    async def _embed(self, texts: list[str]) -> list[list[float]]:
-        """
-        Embed a list of texts using the Mixedbread AI API.
-        Args:
-            texts (list[str]): List of texts to embed.
-        Returns:
-            list[list[float]]: List of embeddings.
-        """
-        client = self.config.get_async_client()
-        tasks = []
-        for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
-            tasks.append(
-                client.embeddings(
-                    model=self.config.embedder_model_name,
-                    normalized=True,
-                    encoding_format=ENCODING_FORMAT,
-                    truncation_strategy=TRUNCATION_STRATEGY,
-                    request_options=self.get_request_options(),
-                    input=batch,
-                )
-            )
-        responses = await asyncio.gather(*tasks)
-        return [item.embedding for response in responses for item in response.data]
-    async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        """
-        Embed a list of document elements.
-        Args:
-            elements (list[Element]): List of document elements.
-        Returns:
-            list[Element]: Elements with embeddings.
-        """
-        elements = elements.copy()
-        elements_with_text = [e for e in elements if e.get("text")]
-        embeddings = await self._embed([e["text"] for e in elements_with_text])
-        for element, embedding in zip(elements_with_text, embeddings):
-            element[EMBEDDINGS_KEY] = embedding
-        return elements
-    async def embed_query(self, query: str) -> list[float]:
-        """
-        Embed a query string.
-        Args:
-            query (str): Query string to embed.
-        Returns:
-            list[float]: Embedding of the query.
-        """
-        embedding = await self._embed([query])
-        return embedding[0]
+    def get_client(self) -> "AsyncMixedbreadAI":
+        return self.config.get_async_client()
+    async def embed_batch(self, client: "AsyncMixedbreadAI", batch: list[str]) -> list[list[float]]:
+        response = await client.embeddings(
+            model=self.config.embedder_model_name,
+            normalized=True,
+            encoding_format=ENCODING_FORMAT,
+            truncation_strategy=TRUNCATION_STRATEGY,
+            request_options=self.get_request_options(),
+            input=batch,
+        )
+        return [datum.embedding for datum in response.data]

unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl