unstructured-ingest 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

@@ -1 +1 @@
-__version__ = "0.0.19" # pragma: no cover
+__version__ = "0.0.21" # pragma: no cover
@@ -37,11 +37,11 @@ class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
                 "numbers, and underscores.",
             ),
             click.Option(
-                ["--namespace"],
+                ["--keyspace"],
                 required=False,
                 default=None,
                 type=str,
-                help="The Astra DB connection namespace.",
+                help="The Astra DB connection keyspace.",
             ),
         ]
         return options
@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if t.TYPE_CHECKING:
-    from astrapy.db import AstraDB, AstraDBCollection
+    from astrapy import Collection as AstraDBCollection
+    from astrapy import Database as AstraDB
 
 NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
 
@@ -39,6 +40,7 @@ class AstraDBAccessConfig(AccessConfig):
 class SimpleAstraDBConfig(BaseConnectorConfig):
     access_config: AstraDBAccessConfig
     collection_name: str
+    keyspace: t.Optional[str] = None
     namespace: t.Optional[str] = None
 
 
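The new keyspace field coexists with the now-deprecated namespace. Everywhere below, the effective value is resolved with the same one-line fallback, shown here standalone (config stands in for any of the connector config objects):

# Prefer the new keyspace field; fall back to the deprecated namespace
keyspace_param = config.keyspace or config.namespace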
@@ -98,22 +100,30 @@ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def astra_db_collection(self) -> "AstraDBCollection":
         if self._astra_db_collection is None:
-            from astrapy.db import AstraDB
+            from astrapy import DataAPIClient as AstraDBClient
 
-            # Build the Astra DB object.
+            # Choose keyspace or deprecated namespace
+            keyspace_param = self.connector_config.keyspace or self.connector_config.namespace
+
+            # Create a client object to interact with the Astra DB
             # caller_name/version for Astra DB tracking
-            self._astra_db = AstraDB(
-                api_endpoint=self.connector_config.access_config.api_endpoint,
-                token=self.connector_config.access_config.token,
-                namespace=self.connector_config.namespace,
+            my_client = AstraDBClient(
                 caller_name=integration_name,
                 caller_version=integration_version,
             )
 
-            # Create and connect to the collection
-            self._astra_db_collection = self._astra_db.collection(
-                collection_name=self.connector_config.collection_name,
+            # Get the database object
+            self._astra_db = my_client.get_database(
+                api_endpoint=self.connector_config.access_config.api_endpoint,
+                token=self.connector_config.access_config.token,
+                keyspace=keyspace_param,
             )
+
+            # Create and connect to the newly created collection
+            self._astra_db_collection = self._astra_db.get_collection(
+                name=self.connector_config.collection_name,
+            )
+
         return self._astra_db_collection # type: ignore
 
     @requires_dependencies(["astrapy"], extras="astradb")
@@ -132,8 +142,14 @@ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_ingest_docs(self): # type: ignore
         # Perform the find operation
-        astra_db_docs = list(self.astra_db_collection.paginated_find())
+        astra_db_docs_cursor = self.astra_db_collection.find({})
 
+        # Iterate over the cursor
+        astra_db_docs = []
+        for result in astra_db_docs_cursor:
+            astra_db_docs.append(result)
+
+        # Create a list of AstraDBIngestDoc objects
         doc_list = []
         for record in astra_db_docs:
             doc = AstraDBIngestDoc(
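For reference, a minimal sketch of the astrapy client flow the source connector migrates to above: a DataAPIClient replaces the removed astrapy.db.AstraDB entry point, and paginated_find() gives way to iterating a find({}) cursor. The endpoint, token, and names below are placeholders:

from astrapy import DataAPIClient

client = DataAPIClient(caller_name="my-app", caller_version="0.0.1")
database = client.get_database(
    api_endpoint="https://<db-id>-<region>.apps.astra.datastax.com",
    token="AstraCS:...",
    keyspace="my_keyspace",
)
collection = database.get_collection(name="my_collection")

# find({}) returns a cursor; iterate it to materialize the documents
docs = [doc for doc in collection.find({})]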
@@ -182,30 +198,41 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def astra_db_collection(self) -> "AstraDBCollection":
         if self._astra_db_collection is None:
-            from astrapy.db import AstraDB
+            from astrapy import DataAPIClient as AstraDBClient
+            from astrapy.exceptions import CollectionAlreadyExistsException
+
+            # Choose keyspace or deprecated namespace
+            keyspace_param = self.connector_config.keyspace or self.connector_config.namespace
 
             collection_name = self.connector_config.collection_name
             embedding_dimension = self.write_config.embedding_dimension
-
-            # If the user has requested an indexing policy, pass it to the Astra DB
             requested_indexing_policy = self.write_config.requested_indexing_policy
-            options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
 
+            # Create a client object to interact with the Astra DB
             # caller_name/version for Astra DB tracking
-            self._astra_db = AstraDB(
-                api_endpoint=self.connector_config.access_config.api_endpoint,
-                token=self.connector_config.access_config.token,
-                namespace=self.connector_config.namespace,
+            my_client = AstraDBClient(
                 caller_name=integration_name,
                 caller_version=integration_version,
             )
 
-            # Create and connect to the newly created collection
-            self._astra_db_collection = self._astra_db.create_collection(
-                collection_name=collection_name,
-                dimension=embedding_dimension,
-                options=options,
+            # Get the database object
+            self._astra_db = my_client.get_database(
+                api_endpoint=self.connector_config.access_config.api_endpoint,
+                token=self.connector_config.access_config.token,
+                keyspace=keyspace_param,
             )
+
+            # Create and connect to the newly created collection
+            try:
+                self._astra_db_collection = self._astra_db.create_collection(
+                    name=collection_name,
+                    dimension=embedding_dimension,
+                    indexing=requested_indexing_policy,
+                )
+            except CollectionAlreadyExistsException as e:
+                logger.info(f"{e}", exc_info=True)
+                self._astra_db_collection = self._astra_db.get_collection(name=collection_name)
+
         return self._astra_db_collection
 
     @requires_dependencies(["astrapy"], extras="astradb")
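The destination connector now creates the collection eagerly and treats an existing collection as a recoverable condition rather than an error. A condensed sketch of that pattern, with placeholder names and an illustrative indexing policy:

from astrapy import DataAPIClient
from astrapy.exceptions import CollectionAlreadyExistsException

database = DataAPIClient().get_database(
    api_endpoint="https://<db-id>-<region>.apps.astra.datastax.com",
    token="AstraCS:...",
)
try:
    collection = database.create_collection(
        name="my_collection",
        dimension=384,  # matches the default embedding_dimension
        indexing={"deny": ["content"]},  # illustrative indexing policy
    )
except CollectionAlreadyExistsException:
    # The collection already exists: reuse it instead of failing
    collection = database.get_collection(name="my_collection")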
@@ -224,6 +251,9 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(f"inserting / updating {len(elements_dict)} documents to Astra DB.")
 
+        if self._astra_db_collection is None:
+            raise DestinationConnectionError("Astra DB collection not available for insertion.")
+
         astra_db_batch_size = self.write_config.batch_size
 
         for batch in batch_generator(elements_dict, astra_db_batch_size):
@@ -1,5 +1,5 @@
 from abc import ABC
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional
 
@@ -9,6 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.unstructured_api import call_api
 
 CHUNK_MAX_CHARS_DEFAULT: int = 500
 CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -111,35 +112,13 @@ class Chunker(BaseProcess, ABC):
 
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-        from unstructured_client import UnstructuredClient
-        from unstructured_client.models.operations import PartitionRequest
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key.get_secret_value(),
+        elements = await call_api(
             server_url=self.config.chunking_endpoint,
+            api_key=self.config.chunk_api_key.get_secret_value(),
+            filename=elements_filepath,
+            api_parameters=self.config.to_chunking_kwargs(),
         )
-        partition_request = self.config.to_chunking_kwargs()
-        possible_fields = [f.name for f in fields(PartitionParameters)]
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        with open(elements_filepath, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(elements_filepath.resolve()),
-            )
-            filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        partition_request_obj = PartitionRequest(partition_params)
-        resp = client.general.partition(partition_request_obj)
-        elements = resp.elements or []
+
         elements = assign_and_map_hash_ids(elements=elements)
+
         return elements
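With the request-building logic factored out (see the new unstructured_api module at the end of this diff), run_async reduces to a single call into the shared helper. A sketch of the call site; the endpoint, key, and chunking kwargs are illustrative:

import asyncio
from pathlib import Path

from unstructured_ingest.v2.unstructured_api import call_api

async def chunk_remotely() -> list[dict]:
    # Placeholder endpoint/key/kwargs for illustration
    return await call_api(
        server_url="https://api.unstructured.io",
        api_key="my-api-key",
        filename=Path("elements.json"),
        api_parameters={"chunking_strategy": "by_title"},
    )

elements = asyncio.run(chunk_remotely())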
@@ -25,7 +25,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 
 if TYPE_CHECKING:
-    from astrapy.db import AstraDBCollection
+    from astrapy import Collection as AstraDBCollection
+
 
 CONNECTOR_TYPE = "astradb"
 
@@ -85,7 +86,12 @@ class AstraDBUploaderConfig(UploaderConfig):
     embedding_dimension: int = Field(
         default=384, description="The dimensionality of the embeddings"
     )
-    namespace: Optional[str] = Field(default=None, description="The Astra DB connection namespace.")
+    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The Astra DB connection namespace.",
+        deprecated="Please use 'keyspace' instead.",
+    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
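The deprecated= argument on the namespace field leans on pydantic's field-deprecation support (added in pydantic 2.7; the pinned version is an assumption here). Reading a deprecated field then emits a DeprecationWarning with the given message, as in this minimal sketch:

from typing import Optional

from pydantic import BaseModel, Field

class ExampleConfig(BaseModel):
    keyspace: Optional[str] = Field(default=None)
    namespace: Optional[str] = Field(
        default=None, deprecated="Please use 'keyspace' instead."
    )

cfg = ExampleConfig(namespace="old_keyspace")
_ = cfg.namespace  # emits DeprecationWarning: Please use 'keyspace' instead.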
@@ -109,33 +115,34 @@ class AstraDBUploader(Uploader):
 
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
-        from astrapy.db import AstraDB
+        from astrapy import DataAPIClient as AstraDBClient
 
-        # Get the collection_name and embedding dimension
-        collection_name = self.upload_config.collection_name
-        embedding_dimension = self.upload_config.embedding_dimension
-        requested_indexing_policy = self.upload_config.requested_indexing_policy
+        # Choose keyspace or deprecated namespace
+        keyspace_param = self.upload_config.keyspace or self.upload_config.namespace
 
-        # If the user has requested an indexing policy, pass it to the Astra DB
-        options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
+        # Get the collection_name
+        collection_name = self.upload_config.collection_name
 
         # Build the Astra DB object.
-        # caller_name/version for AstraDB tracking
         access_configs = self.connection_config.access_config.get_secret_value()
-        astra_db = AstraDB(
-            api_endpoint=access_configs.api_endpoint,
-            token=access_configs.token,
-            namespace=self.upload_config.namespace,
+
+        # Create a client object to interact with the Astra DB
+        # caller_name/version for Astra DB tracking
+        my_client = AstraDBClient(
            caller_name=integration_name,
            caller_version=integration_version,
        )
 
-        # Create and connect to the newly created collection
-        astra_db_collection = astra_db.create_collection(
-            collection_name=collection_name,
-            dimension=embedding_dimension,
-            options=options,
+        # Get the database object
+        astra_db = my_client.get_database(
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
+            keyspace=keyspace_param,
         )
+
+        # Connect to the newly created collection
+        astra_db_collection = astra_db.get_collection(name=collection_name)
+
         return astra_db_collection
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -42,8 +42,10 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         description="The Databricks password part of basic authentication. "
         "Only possible when Host is *.cloud.databricks.com (AWS).",
     )
-    client_id: Optional[str] = Field(default=None)
-    client_secret: Optional[str] = Field(default=None)
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
     token: Optional[str] = Field(
         default=None,
         description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
@@ -140,11 +142,12 @@ class DatabricksVolumesUploader(Uploader):
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         output_path = os.path.join(self.upload_config.path, path.name)
-        self.get_client().files.upload(
-            file_path=output_path,
-            contents=path,
-            overwrite=self.upload_config.overwrite,
-        )
+        with open(path, "rb") as elements_file:
+            self.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
 
 
 databricks_volumes_destination_entry = DestinationRegistryEntry(
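The uploader previously handed a Path object to contents=; the Databricks SDK's files.upload expects binary content, so the file is now opened explicitly and the handle streamed. A minimal standalone sketch, assuming credentials are resolved from the environment and using placeholder paths:

from databricks.sdk import WorkspaceClient

client = WorkspaceClient()  # host/token picked up from env or config profile
with open("output/elements.json", "rb") as f:
    client.files.upload(
        file_path="/Volumes/my_catalog/my_schema/my_volume/elements.json",
        contents=f,
        overwrite=True,
    )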
@@ -1,8 +1,7 @@
-import asyncio
 from abc import ABC
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Any, Optional
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -10,11 +9,7 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-
-if TYPE_CHECKING:
-    from unstructured_client import UnstructuredClient
-    from unstructured_client.models.operations import PartitionRequest
-    from unstructured_client.models.shared import PartitionParameters
+from unstructured_ingest.v2.unstructured_api import call_api
 
 
 class PartitionerConfig(BaseModel):
@@ -154,60 +149,19 @@ class Partitioner(BaseProcess, ABC):
         )
         return self.postprocess(elements=elements_to_dicts(elements))
 
-    async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
-        # TODO when client supports async, run without using run_in_executor
-        # isolate the IO heavy call
-        loop = asyncio.get_event_loop()
-        return await loop.run_in_executor(None, client.general.partition, request)
-
-    def create_partition_parameters(self, filename: Path) -> "PartitionParameters":
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        partition_request = self.config.to_partition_kwargs()
-
-        # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
-        # Prior to this it was a dataclass which doesn't have .__fields
-        try:
-            possible_fields = PartitionParameters.__fields__
-        except AttributeError:
-            possible_fields = [f.name for f in fields(PartitionParameters)]
-
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
-        with open(filename, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(filename.resolve()),
-            )
-            filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        return partition_params
-
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def partition_via_api(
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
-        from unstructured_client import UnstructuredClient
-        from unstructured_client.models.operations import PartitionRequest
-
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
-        client = UnstructuredClient(
+
+        elements = await call_api(
             server_url=self.config.partition_endpoint,
-            api_key_auth=self.config.api_key.get_secret_value(),
+            api_key=self.config.api_key.get_secret_value(),
+            filename=filename,
+            api_parameters=self.config.to_partition_kwargs(),
         )
-        partition_params = self.create_partition_parameters(filename=filename)
-        partition_request = PartitionRequest(partition_params)
-        resp = await self.call_api(client=client, request=partition_request)
-        elements = resp.elements or []
+
         # Append the data source metadata the auto partition does for you
         for element in elements:
             element["metadata"]["data_source"] = metadata
@@ -0,0 +1,87 @@
+import asyncio
+from dataclasses import fields
+from functools import partial
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from unstructured_client.models.operations import PartitionRequest
+
+
+def create_partition_request(filename: Path, parameters_dict: dict) -> "PartitionRequest":
+    """Given a filename and a dict of API parameters, return a PartitionRequest for use
+    by unstructured-client. Remove any params that aren't recognized by the SDK.
+
+    Args:
+        filename: Path to the file being partitioned
+        parameters_dict: A mapping of all API params we want to send
+
+    Returns: A PartitionRequest containing the file and all valid params
+    """
+    from unstructured_client.models.operations import PartitionRequest
+    from unstructured_client.models.shared import Files, PartitionParameters
+
+    # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
+    # Prior to this it was a dataclass which doesn't have .__fields
+    try:
+        possible_fields = PartitionParameters.__fields__
+    except AttributeError:
+        possible_fields = [f.name for f in fields(PartitionParameters)]
+
+    filtered_partition_request = {k: v for k, v in parameters_dict.items() if k in possible_fields}
+    if len(filtered_partition_request) != len(parameters_dict):
+        logger.debug(
+            "Following fields were omitted due to not being "
+            "supported by the currently used unstructured client: {}".format(
+                ", ".join([v for v in parameters_dict if v not in filtered_partition_request])
+            )
+        )
+
+    logger.debug(f"using hosted partitioner with kwargs: {parameters_dict}")
+
+    with open(filename, "rb") as f:
+        files = Files(
+            content=f.read(),
+            file_name=str(filename.resolve()),
+        )
+        filtered_partition_request["files"] = files
+
+    partition_params = PartitionParameters(**filtered_partition_request)
+
+    return PartitionRequest(partition_parameters=partition_params)
+
+
+async def call_api(
+    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
+) -> list[dict]:
+    """Call the Unstructured API using unstructured-client.
+
+    Args:
+        server_url: The base URL where the API is hosted
+        api_key: The user's API key (can be empty if this is a self hosted API)
+        filename: Path to the file being partitioned
+        api_parameters: A dict containing the requested API parameters
+
+    Returns: A list of the file's elements, or an empty list if there was an error
+    """
+    from unstructured_client import UnstructuredClient
+
+    client = UnstructuredClient(
+        server_url=server_url,
+        api_key_auth=api_key,
+    )
+    partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
+
+    # TODO when client supports async, run without using run_in_executor
+    # isolate the IO heavy call
+    loop = asyncio.get_event_loop()
+
+    # Note(austin) - The partition calls needs request to be a keyword arg
+    # We have to use partial to do this, we can't pass request=request into run_in_executor
+    partition_call = partial(client.general.partition, request=partition_request)
+
+    res = await loop.run_in_executor(None, partition_call)
+
+    return res.elements or []
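Until unstructured-client offers native async support, call_api wraps the blocking SDK call in run_in_executor. Since run_in_executor forwards only positional arguments and the partition call takes request as a keyword, the argument is bound first with functools.partial. The general pattern, sketched with a stand-in function:

import asyncio
from functools import partial

def blocking_call(*, request: str) -> str:
    # Stand-in for client.general.partition, which takes
    # `request` as a keyword argument
    return f"processed {request}"

async def main() -> None:
    loop = asyncio.get_event_loop()
    # Bind the keyword argument up front; run_in_executor passes
    # only positional args to the callable it is given.
    call = partial(blocking_call, request="my-request")
    result = await loop.run_in_executor(None, call)
    print(result)

asyncio.run(main())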