unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +6 -2
- test/integration/connectors/sql/test_singlestore.py +6 -2
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +6 -2
- test/integration/connectors/test_milvus.py +13 -0
- test/integration/connectors/test_onedrive.py +6 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/embedders/test_bedrock.py +28 -0
- test/integration/embedders/test_octoai.py +14 -0
- test/integration/embedders/test_openai.py +13 -0
- test/integration/embedders/test_togetherai.py +10 -0
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/embed/test_octoai.py +8 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +39 -11
- unstructured_ingest/embed/interfaces.py +5 -0
- unstructured_ingest/embed/octoai.py +44 -3
- unstructured_ingest/embed/openai.py +37 -1
- unstructured_ingest/embed/togetherai.py +28 -1
- unstructured_ingest/embed/voyageai.py +33 -1
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/file_data.py +11 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
- unstructured_ingest/v2/processes/connectors/couchbase.py +2 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +3 -4
- unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +5 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/unstructured_api.py +25 -2
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +20 -16
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +52 -48
- test/integration/connectors/test_kafka.py +0 -304
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, Mapping, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
12
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.v2.interfaces import (
|
|
15
|
+
AccessConfig,
|
|
16
|
+
ConnectionConfig,
|
|
17
|
+
FileData,
|
|
18
|
+
Uploader,
|
|
19
|
+
UploaderConfig,
|
|
20
|
+
UploadStager,
|
|
21
|
+
UploadStagerConfig,
|
|
22
|
+
)
|
|
23
|
+
from unstructured_ingest.v2.logger import logger
|
|
24
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
25
|
+
|
|
26
|
+
# Root of the Vectara v2 REST API; all request endpoints are joined onto this.
BASE_URL = "https://api.vectara.io/v2"

# Registry key under which this destination connector is registered.
CONNECTOR_TYPE = "vectara"
31
|
+
class VectaraAccessConfig(AccessConfig):
    """OAuth2 client-credentials pair used to obtain a JWT from the Vectara auth endpoint."""

    # OAuth2 client id for the client_credentials grant
    oauth_client_id: str = Field(description="Client ID")
    # OAuth2 client secret for the client_credentials grant
    oauth_secret: str = Field(description="Client Secret")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class VectaraConnectionConfig(ConnectionConfig):
    """Connection settings for the Vectara destination."""

    # Secret-wrapped OAuth2 credentials
    access_config: Secret[VectaraAccessConfig]
    # Vectara customer id; substituted into token_url to form the Cognito token endpoint
    customer_id: str
    # Either corpus_name or corpus_key identifies the target corpus; when only the
    # name is given, the key is resolved during precheck (see VectaraUploader)
    corpus_name: Optional[str] = None
    corpus_key: Optional[str] = None
    # OAuth2 token endpoint template; "{}" is filled with customer_id
    token_url: str = "https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class VectaraUploadStagerConfig(UploadStagerConfig):
    # No stager-specific options; present for registry/config symmetry.
    pass
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class VectaraUploadStager(UploadStager):
    """Stage partitioned elements into the document payload shape Vectara expects."""

    upload_stager_config: VectaraUploadStagerConfig = field(
        default_factory=lambda: VectaraUploadStagerConfig()
    )

    @staticmethod
    def conform_dict(data: dict) -> dict:
        """
        Prepares dictionary in the format that Vectara requires.
        See more detail in https://docs.vectara.com/docs/rest-api/create-corpus-document

        Select which meta-data fields to include and optionally map them to a new format.
        remove the "metadata-" prefix from the keys
        """
        # Whitelist of flattened metadata keys and their Vectara-facing names.
        metadata_map = {
            "page_number": "page_number",
            "data_source-url": "url",
            "filename": "filename",
            "filetype": "filetype",
            "last_modified": "last_modified",
            "element_id": "element_id",
        }
        flattened = flatten_dict(data, separator="-", flatten_lists=True)
        deprefixed = {key.replace("metadata-", ""): value for key, value in flattened.items()}
        return {
            metadata_map[key]: value for key, value in deprefixed.items() if key in metadata_map
        }

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Convert a partitioned-elements JSON file into a single Vectara document payload."""
        with input_file.open() as in_f:
            elements_contents = json.load(in_f)

        logger.info(
            f"Extending {len(elements_contents)} json elements from content in {input_file}"
        )

        # Each element becomes one document part; the rest of the element (minus
        # its text) is conformed into part-level metadata.
        document_parts = [
            {
                "text": element.pop("text", None),
                "metadata": self.conform_dict(data=element),
            }
            for element in elements_contents
        ]
        conformed_elements = [
            {
                "id": str(uuid.uuid4()),
                "type": "core",
                "metadata": {
                    "title": file_data.identifier,
                },
                "document_parts": document_parts,
            }
        ]

        with output_file.open("w") as out_f:
            json.dump(conformed_elements, out_f, indent=2)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class VectaraUploaderConfig(UploaderConfig):
    # No uploader-specific options; present for registry/config symmetry.
    pass
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
class VectaraUploader(Uploader):
    """Destination uploader that indexes staged documents into a Vectara corpus.

    Authenticates via OAuth2 client credentials, caches the resulting JWT until
    shortly before expiry, and talks to the Vectara v2 REST API via httpx.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: VectaraUploaderConfig
    connection_config: VectaraConnectionConfig
    # Cached OAuth2 bearer token and its absolute expiry (epoch seconds);
    # both are set together by the token getters.
    _jwt_token: Optional[str] = field(init=False, default=None)
    _jwt_token_expires_ts: Optional[float] = field(init=False, default=None)

    def is_async(self) -> bool:
        # Uploads are performed through run_data_async.
        return True

    def precheck(self) -> None:
        """Fail fast if the connection or corpus configuration is invalid."""
        try:
            self._check_connection_and_corpora()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    @property
    async def jwt_token_async(self) -> str:
        # Refresh when missing or within 60 seconds of expiry.
        if not self._jwt_token or self._jwt_token_expires_ts - datetime.now().timestamp() <= 60:
            self._jwt_token = await self._get_jwt_token_async()
        return self._jwt_token

    @property
    def jwt_token(self) -> str:
        # Synchronous counterpart of jwt_token_async (used by precheck).
        if not self._jwt_token or self._jwt_token_expires_ts - datetime.now().timestamp() <= 60:
            self._jwt_token = self._get_jwt_token()
        return self._jwt_token

    def _token_request_parts(self) -> tuple[str, dict, dict]:
        """Build the (endpoint, headers, form-data) for the OAuth2 token request."""
        token_endpoint = self.connection_config.token_url.format(self.connection_config.customer_id)
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        access_config = self.connection_config.access_config.get_secret_value()
        data = {
            "grant_type": "client_credentials",
            "client_id": access_config.oauth_client_id,
            "client_secret": access_config.oauth_secret,
        }
        return token_endpoint, headers, data

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    async def _get_jwt_token_async(self) -> str:
        """Connect to the server and get a JWT token."""
        import httpx

        token_endpoint, headers, data = self._token_request_parts()

        async with httpx.AsyncClient() as client:
            response = await client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        request_time = datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")

        return response_json.get("access_token")

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    def _get_jwt_token(self) -> str:
        """Connect to the server and get a JWT token."""
        import httpx

        token_endpoint, headers, data = self._token_request_parts()

        with httpx.Client() as client:
            response = client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        request_time = datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")

        return response_json.get("access_token")

    @DestinationConnectionError.wrap
    def _check_connection_and_corpora(self) -> None:
        """
        Check the connection for Vectara and validate corpus exists.
        - If more than one corpus with the same name exists - raise error
        - If exactly one corpus exists with this name - use it.
        - If does not exist - raise error.
        """
        # Get token if not already set
        self.jwt_token

        # BUGFIX: _request returns the parsed JSON body (a dict). The previous
        # `_, list_corpora_response = self._request(...)` tuple-unpacking
        # iterated the dict's KEYS, binding a string instead of the response.
        list_corpora_response = self._request(
            http_method="GET",
            endpoint="corpora",
        )

        if self.connection_config.corpus_name:
            possible_corpora_keys_names_map = {
                corpus.get("key"): corpus.get("name")
                for corpus in list_corpora_response.get("corpora")
                if corpus.get("name") == self.connection_config.corpus_name
            }

            if len(possible_corpora_keys_names_map) > 1:
                raise ValueError(
                    f"Multiple Corpus exist with name {self.connection_config.corpus_name} in dest."
                )
            if len(possible_corpora_keys_names_map) == 1:
                if not self.connection_config.corpus_key:
                    # Resolve the key from the (unique) matching corpus name.
                    self.connection_config.corpus_key = list(
                        possible_corpora_keys_names_map.keys()
                    )[0]
                elif (
                    self.connection_config.corpus_key
                    != list(possible_corpora_keys_names_map.keys())[0]
                ):
                    raise ValueError("Corpus key does not match provided corpus name.")
            else:
                raise ValueError(
                    f"No Corpora exist with name {self.connection_config.corpus_name} in dest."
                )

    @requires_dependencies(["httpx"], extras="vectara")
    async def _async_request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """Issue an authenticated async request to BASE_URL/endpoint; return parsed JSON."""
        import httpx

        url = f"{BASE_URL}/{endpoint}"

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {await self.jwt_token_async}",
            "X-source": "unstructured",
        }

        async with httpx.AsyncClient() as client:
            response = await client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return response.json()

    @requires_dependencies(["httpx"], extras="vectara")
    def _request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """Issue an authenticated sync request to BASE_URL/endpoint; return parsed JSON."""
        import httpx

        url = f"{BASE_URL}/{endpoint}"

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {self.jwt_token}",
            "X-source": "unstructured",
        }

        with httpx.Client() as client:
            response = client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return response.json()

    async def _delete_doc(self, doc_id: str) -> dict:
        """
        Delete a document from the Vectara corpus.
        """
        return await self._async_request(
            endpoint=f"corpora/{self.connection_config.corpus_key}/documents/{doc_id}",
            http_method="DELETE",
        )

    async def _index_document(self, document: Dict[str, Any]) -> None:
        """
        Index a document (by uploading it to the Vectara corpus) from the document dictionary
        """
        logger.debug(
            f"Indexing document {document['id']} to corpus key {self.connection_config.corpus_key}"
        )

        try:
            result = await self._async_request(
                endpoint=f"corpora/{self.connection_config.corpus_key}/documents", data=document
            )
        except Exception as e:
            # Best-effort: log and skip this document rather than failing the batch.
            logger.error(f"exception {e} while indexing document {document['id']}")
            return

        if (
            "messages" in result
            and result["messages"]
            and (
                "ALREADY_EXISTS" in result["messages"]
                or (
                    "CONFLICT: Indexing doesn't support updating documents."
                    in result["messages"][0]
                )
            )
        ):
            # Vectara indexing cannot update in place: delete, then re-create.
            logger.info(f"document {document['id']} already exists, re-indexing")
            await self._delete_doc(document["id"])
            await self._async_request(
                endpoint=f"corpora/{self.connection_config.corpus_key}/documents", data=document
            )
            return

        logger.info(f"indexing document {document['id']} succeeded")

    async def run_data_async(
        self,
        data: list[dict],
        file_data: FileData,
        **kwargs: Any,
    ) -> None:
        """Upload all staged documents for a file concurrently."""
        logger.info(f"inserting / updating {len(data)} documents to Vectara ")
        await asyncio.gather(*(self._index_document(vdoc) for vdoc in data))
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# Registry entry wiring the Vectara destination: connection, uploader, and stager configs.
vectara_destination_entry = DestinationRegistryEntry(
    connection_config=VectaraConnectionConfig,
    uploader=VectaraUploader,
    uploader_config=VectaraUploaderConfig,
    upload_stager=VectaraUploadStager,
    upload_stager_config=VectaraUploadStagerConfig,
)
|
|
@@ -2,6 +2,7 @@ from dataclasses import fields
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from typing import TYPE_CHECKING, Optional
|
|
4
4
|
|
|
5
|
+
from unstructured_ingest.v2.errors import ProviderError, UserError
|
|
5
6
|
from unstructured_ingest.v2.logger import logger
|
|
6
7
|
|
|
7
8
|
if TYPE_CHECKING:
|
|
@@ -51,6 +52,22 @@ def create_partition_request(filename: Path, parameters_dict: dict) -> "Partitio
|
|
|
51
52
|
return PartitionRequest(partition_parameters=partition_params)
|
|
52
53
|
|
|
53
54
|
|
|
55
|
+
def handle_error(e: Exception):
    """Translate errors from the Unstructured API client into UserError/ProviderError.

    4xx SDK errors become UserError, 5xx become ProviderError; anything else is
    logged and re-raised unchanged.
    """
    from unstructured_client.models.errors.sdkerror import SDKError

    if not isinstance(e, SDKError):
        logger.error(f"Uncaught Error calling API: {e}")
        raise e

    logger.error(f"Error calling Unstructured API: {e}")
    status = e.status_code
    if 400 <= status < 500:
        raise UserError(e.body)
    if status >= 500:
        raise ProviderError(e.body)
    raise e
|
|
69
|
+
|
|
70
|
+
|
|
54
71
|
async def call_api_async(
|
|
55
72
|
server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
|
|
56
73
|
) -> list[dict]:
|
|
@@ -71,7 +88,10 @@ async def call_api_async(
|
|
|
71
88
|
api_key_auth=api_key,
|
|
72
89
|
)
|
|
73
90
|
partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
|
|
74
|
-
|
|
91
|
+
try:
|
|
92
|
+
res = await client.general.partition_async(request=partition_request)
|
|
93
|
+
except Exception as e:
|
|
94
|
+
handle_error(e)
|
|
75
95
|
|
|
76
96
|
return res.elements or []
|
|
77
97
|
|
|
@@ -96,6 +116,9 @@ def call_api(
|
|
|
96
116
|
api_key_auth=api_key,
|
|
97
117
|
)
|
|
98
118
|
partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
|
|
99
|
-
|
|
119
|
+
try:
|
|
120
|
+
res = client.general.partition(request=partition_request)
|
|
121
|
+
except Exception as e:
|
|
122
|
+
handle_error(e)
|
|
100
123
|
|
|
101
124
|
return res.elements or []
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.3.10
|
|
3
|
+
Version: 0.3.12
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,14 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.13
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
-
Requires-Dist: opentelemetry-sdk
|
|
26
|
-
Requires-Dist: python-dateutil
|
|
27
|
-
Requires-Dist: click
|
|
28
25
|
Requires-Dist: ndjson
|
|
26
|
+
Requires-Dist: python-dateutil
|
|
29
27
|
Requires-Dist: pydantic>=2.7
|
|
30
28
|
Requires-Dist: pandas
|
|
31
|
-
Requires-Dist: tqdm
|
|
32
29
|
Requires-Dist: dataclasses-json
|
|
30
|
+
Requires-Dist: tqdm
|
|
31
|
+
Requires-Dist: click
|
|
32
|
+
Requires-Dist: opentelemetry-sdk
|
|
33
33
|
Provides-Extra: airtable
|
|
34
34
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
35
35
|
Provides-Extra: astradb
|
|
@@ -42,11 +42,11 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
|
42
42
|
Provides-Extra: bedrock
|
|
43
43
|
Requires-Dist: boto3; extra == "bedrock"
|
|
44
44
|
Provides-Extra: biomed
|
|
45
|
-
Requires-Dist: bs4; extra == "biomed"
|
|
46
45
|
Requires-Dist: requests; extra == "biomed"
|
|
46
|
+
Requires-Dist: bs4; extra == "biomed"
|
|
47
47
|
Provides-Extra: box
|
|
48
|
-
Requires-Dist: fsspec; extra == "box"
|
|
49
48
|
Requires-Dist: boxfs; extra == "box"
|
|
49
|
+
Requires-Dist: fsspec; extra == "box"
|
|
50
50
|
Provides-Extra: chroma
|
|
51
51
|
Requires-Dist: chromadb; extra == "chroma"
|
|
52
52
|
Provides-Extra: clarifai
|
|
@@ -90,9 +90,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
90
90
|
Provides-Extra: epub
|
|
91
91
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
92
92
|
Provides-Extra: gcs
|
|
93
|
+
Requires-Dist: gcsfs; extra == "gcs"
|
|
93
94
|
Requires-Dist: bs4; extra == "gcs"
|
|
94
95
|
Requires-Dist: fsspec; extra == "gcs"
|
|
95
|
-
Requires-Dist: gcsfs; extra == "gcs"
|
|
96
96
|
Provides-Extra: github
|
|
97
97
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
98
98
|
Requires-Dist: requests; extra == "github"
|
|
@@ -101,8 +101,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
|
|
|
101
101
|
Provides-Extra: google-drive
|
|
102
102
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
103
103
|
Provides-Extra: hubspot
|
|
104
|
-
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
105
104
|
Requires-Dist: urllib3; extra == "hubspot"
|
|
105
|
+
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
106
106
|
Provides-Extra: jira
|
|
107
107
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
108
108
|
Provides-Extra: kafka
|
|
@@ -120,20 +120,20 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
120
120
|
Provides-Extra: msg
|
|
121
121
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
122
122
|
Provides-Extra: neo4j
|
|
123
|
-
Requires-Dist: cymple; extra == "neo4j"
|
|
124
123
|
Requires-Dist: neo4j; extra == "neo4j"
|
|
125
124
|
Requires-Dist: networkx; extra == "neo4j"
|
|
125
|
+
Requires-Dist: cymple; extra == "neo4j"
|
|
126
126
|
Provides-Extra: notion
|
|
127
|
-
Requires-Dist: htmlBuilder; extra == "notion"
|
|
128
127
|
Requires-Dist: backoff; extra == "notion"
|
|
129
|
-
Requires-Dist: notion-client; extra == "notion"
|
|
128
|
+
Requires-Dist: htmlBuilder; extra == "notion"
|
|
130
129
|
Requires-Dist: httpx; extra == "notion"
|
|
130
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
131
131
|
Provides-Extra: odt
|
|
132
132
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
133
133
|
Provides-Extra: onedrive
|
|
134
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
135
134
|
Requires-Dist: bs4; extra == "onedrive"
|
|
136
135
|
Requires-Dist: msal; extra == "onedrive"
|
|
136
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
137
137
|
Provides-Extra: openai
|
|
138
138
|
Requires-Dist: openai; extra == "openai"
|
|
139
139
|
Requires-Dist: tiktoken; extra == "openai"
|
|
@@ -142,8 +142,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
|
|
|
142
142
|
Provides-Extra: org
|
|
143
143
|
Requires-Dist: unstructured[org]; extra == "org"
|
|
144
144
|
Provides-Extra: outlook
|
|
145
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
146
145
|
Requires-Dist: msal; extra == "outlook"
|
|
146
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
147
147
|
Provides-Extra: pdf
|
|
148
148
|
Requires-Dist: unstructured[pdf]; extra == "pdf"
|
|
149
149
|
Provides-Extra: pinecone
|
|
@@ -158,6 +158,8 @@ Provides-Extra: qdrant
|
|
|
158
158
|
Requires-Dist: qdrant-client; extra == "qdrant"
|
|
159
159
|
Provides-Extra: reddit
|
|
160
160
|
Requires-Dist: praw; extra == "reddit"
|
|
161
|
+
Provides-Extra: redis
|
|
162
|
+
Requires-Dist: redis; extra == "redis"
|
|
161
163
|
Provides-Extra: remote
|
|
162
164
|
Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
|
|
163
165
|
Provides-Extra: rst
|
|
@@ -170,11 +172,11 @@ Requires-Dist: fsspec; extra == "s3"
|
|
|
170
172
|
Provides-Extra: salesforce
|
|
171
173
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
172
174
|
Provides-Extra: sftp
|
|
173
|
-
Requires-Dist: fsspec; extra == "sftp"
|
|
174
175
|
Requires-Dist: paramiko; extra == "sftp"
|
|
176
|
+
Requires-Dist: fsspec; extra == "sftp"
|
|
175
177
|
Provides-Extra: sharepoint
|
|
176
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
177
178
|
Requires-Dist: msal; extra == "sharepoint"
|
|
179
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
178
180
|
Provides-Extra: singlestore
|
|
179
181
|
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
180
182
|
Provides-Extra: slack
|
|
@@ -187,6 +189,8 @@ Requires-Dist: together; extra == "togetherai"
|
|
|
187
189
|
Provides-Extra: tsv
|
|
188
190
|
Requires-Dist: unstructured[tsv]; extra == "tsv"
|
|
189
191
|
Provides-Extra: vectara
|
|
192
|
+
Requires-Dist: httpx; extra == "vectara"
|
|
193
|
+
Requires-Dist: aiofiles; extra == "vectara"
|
|
190
194
|
Requires-Dist: requests; extra == "vectara"
|
|
191
195
|
Provides-Extra: weaviate
|
|
192
196
|
Requires-Dist: weaviate-client; extra == "weaviate"
|