unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +48 -34
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.7.2" # pragma: no cover
+__version__ = "1.0.2" # pragma: no cover
unstructured_ingest/cli/README.md

@@ -0,0 +1,28 @@
+# Ingest CLI
+This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
+
+## Design Reference
+[cli.py](cli.py) is the main entrypoint to run the cli itself. The key points for this is the interaction between all
+source and destination connectors.
+
+To manually run the cli:
+```shell
+PYTHONPATH=. python unstructured_ingest/v2/main.py --help
+```
+
+The `main.py` file simply wraps the generated Click command created in `cli.py`.
+
+### Source Commands
+All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
+different connectors with shared and unique parameters.
+
+### Destination Commands
+All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
+sub command to display all possible destination subcommands. The code un [utils.py](./utils.py) helps structure the
+generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
+
+### Configs
+The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
+are needed to marshall the input dictionary from Click into all the respective configs needed to create a full pipeline run.
+Because click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
+into dataclasses that have nested fields (such as access configs).
unstructured_ingest/embed/mixedbreadai.py

@@ -114,7 +114,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):

@dataclass
class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
-
    config: MixedbreadAIEmbeddingConfig

    async def get_exemplary_embedding(self) -> list[float]:
unstructured_ingest/interfaces/upload_stager.py

@@ -8,7 +8,7 @@ from pydantic import BaseModel
from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.interfaces import BaseProcess
from unstructured_ingest.utils import ndjson
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data, write_data


class UploadStagerConfig(BaseModel):
@@ -43,7 +43,7 @@ class UploadStager(BaseProcess, ABC):
        writer.f.flush()

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-        elements_contents =
+        elements_contents = get_json_data(path=input_file)

        conformed_elements = [
            self.conform_dict(element_dict=element, file_data=file_data)
unstructured_ingest/interfaces/uploader.py

@@ -7,7 +7,7 @@ from pydantic import BaseModel

from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.interfaces import BaseConnector, BaseProcess
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data


class UploaderConfig(BaseModel):
@@ -45,11 +45,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
        return False

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
        self.run_data(data=data, file_data=file_data, **kwargs)

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
        await self.run_data_async(data=data, file_data=file_data, **kwargs)

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
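Both `run` and `run_async` now route file reads through `get_json_data` before handing the element dicts to the data methods. As a rough stand-in only (the real helper lives in `unstructured_ingest.utils.data_prep` and may also handle NDJSON), it behaves roughly like:

```python
import json
from pathlib import Path


def get_json_data(path: Path) -> list[dict]:
    """Hypothetical stand-in: load a serialized list of element dicts from disk."""
    with path.open() as f:
        return json.load(f)
```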
unstructured_ingest/main.py (changed, with no content differences)
unstructured_ingest/pipeline/interfaces.py

@@ -119,7 +119,7 @@ class PipelineStep(ABC):
        iterable = iterable or []
        if iterable:
            logger.info(
-                f"calling {self.__class__.__name__}
+                f"calling {self.__class__.__name__} with {len(iterable)} docs",  # type: ignore
            )
        else:
            logger.info(f"calling {self.__class__.__name__} with no inputs")
unstructured_ingest/pipeline/pipeline.py

@@ -220,7 +220,7 @@ class Pipeline:

    def _run(self):
        logger.info(
-            f"running local pipeline: {self} with configs:
+            f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
        )
        if self.context.mp_supported:
            manager = mp.Manager()
unstructured_ingest/processes/chunker.py

@@ -24,6 +24,9 @@ class ChunkerConfig(BaseModel):
        default="https://api.unstructuredapp.io/general/v0/general",
        description="If chunking via api, use the following host.",
    )
+    chunk_api_timeout_ms: Optional[int] = Field(
+        default=None, description="Timeout in milliseconds for all api call during chunking."
+    )
    chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
    chunk_api_key: Optional[SecretStr] = Field(
        default=None, description="API Key for chunking endpoint."

@@ -120,6 +123,7 @@ class Chunker(BaseProcess, ABC):
            api_key=self.config.chunk_api_key.get_secret_value(),
            filename=elements_filepath,
            api_parameters=self.config.to_chunking_kwargs(),
+            timeout_ms=self.config.chunk_api_timeout_ms,
        )

        elements = assign_and_map_hash_ids(elements=elements)
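For callers, the new field is just another knob on `ChunkerConfig`. A hedged usage sketch, assuming the remaining fields keep their defaults:

```python
from unstructured_ingest.processes.chunker import ChunkerConfig

# Opt into API-based chunking and bound each chunking API call at 60 seconds.
# Field names come from the diff above; any other required fields are assumed to default.
chunker_config = ChunkerConfig(
    chunk_by_api=True,
    chunk_api_key="my-api-key",  # coerced to SecretStr by pydantic
    chunk_api_timeout_ms=60_000,
)
```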
unstructured_ingest/processes/connectors/airtable.py

@@ -3,7 +3,6 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator, Optional
from uuid import NAMESPACE_DNS, uuid5

-import pandas
from pydantic import BaseModel, Field, Secret, field_validator

from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers

@@ -213,10 +212,13 @@ class AirtableDownloader(Downloader):
        row_dict.update(table_row["fields"])
        return row_dict

+    @requires_dependencies(["pandas"], extras="airtable")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        import pandas as pd
+
        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
        table_contents = self.get_table_contents(table_meta=table_meta)
-        df =
+        df = pd.DataFrame.from_dict(
            data=[self._table_row_to_dict(table_row=row) for row in table_contents]
        ).sort_index(axis=1)
        download_path = self.get_download_path(file_data=file_data)
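The Airtable change moves pandas from a module-level import to a lazy import guarded by `@requires_dependencies`, so the base install no longer needs pandas. A simplified sketch of that pattern follows; the decorator below is an illustrative stand-in, not the package's implementation:

```python
import functools
import importlib.util


def requires_dependencies(deps: list[str], extras: str):
    """Illustrative stand-in: fail with an install hint if a lazy dependency is missing."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            missing = [d for d in deps if importlib.util.find_spec(d) is None]
            if missing:
                raise ImportError(
                    f"Missing dependencies {missing}; install with "
                    f"`pip install unstructured-ingest[{extras}]`"
                )
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["pandas"], extras="airtable")
def rows_to_sorted_frame(rows: list[dict]):
    import pandas as pd  # imported lazily so the base install does not need pandas

    return pd.DataFrame.from_dict(data=rows).sort_index(axis=1)
```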
unstructured_ingest/processes/connectors/astradb.py

@@ -1,5 +1,7 @@
+import asyncio
import csv
import hashlib
+import os
import re
from dataclasses import dataclass, field
from pathlib import Path

@@ -8,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Generator, Optional

from pydantic import BaseModel, Field, Secret

-from unstructured_ingest import __name__ as integration_name
from unstructured_ingest.__version__ import __version__ as integration_version
from unstructured_ingest.data_types.file_data import (
    BatchFileData,

@@ -43,7 +44,7 @@ from unstructured_ingest.processes.connector_registry import (
)
from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
-from unstructured_ingest.utils.data_prep import batch_generator,
+from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
from unstructured_ingest.utils.dep_check import requires_dependencies
from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes

@@ -83,10 +84,8 @@ class AstraDBConnectionConfig(ConnectionConfig):

        # Create a client object to interact with the Astra DB
        # caller_name/version for Astra DB tracking
-            caller_version=integration_version,
-        )
+        user_agent = os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
+        return AstraDBClient(callers=[(user_agent, integration_version)])


def get_astra_db(

@@ -141,7 +140,7 @@ async def get_async_astra_collection(
    )

    # Get async collection from AsyncDatabase
-    async_astra_db_collection =
+    async_astra_db_collection = async_astra_db.get_collection(name=collection_name)
    return async_astra_db_collection


@@ -360,13 +359,22 @@ class AstraDBUploader(Uploader):
    upload_config: AstraDBUploaderConfig
    connector_type: str = CONNECTOR_TYPE

+    def is_async(self) -> bool:
+        return True
+
    def init(self, **kwargs: Any) -> None:
        self.create_destination(**kwargs)

+    @requires_dependencies(["astrapy"], extras="astradb")
    def precheck(self) -> None:
        try:
            if self.upload_config.collection_name:
+                collection = get_astra_collection(
+                    connection_config=self.connection_config,
+                    collection_name=self.upload_config.collection_name,
+                    keyspace=self.upload_config.keyspace,
+                )
+                collection.options()
            else:
                # check for db connection only if collection name is not provided
                get_astra_db(

@@ -377,17 +385,7 @@ class AstraDBUploader(Uploader):
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

-    @requires_dependencies(["astrapy"], extras="astradb")
-    def get_collection(self, collection_name: Optional[str] = None) -> "AstraDBCollection":
-        return get_astra_collection(
-            connection_config=self.connection_config,
-            collection_name=collection_name or self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace,
-        )
-
    def _collection_exists(self, collection_name: str):
-        from astrapy.exceptions import CollectionNotFoundException
-
        collection = get_astra_collection(
            connection_config=self.connection_config,
            collection_name=collection_name,

@@ -397,8 +395,10 @@ class AstraDBUploader(Uploader):
        try:
            collection.options()
            return True
-        except
+        except RuntimeError as e:
+            if "not found" in str(e):
+                return False
+            raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
        except Exception as e:
            logger.error(f"failed to check if astra collection exists : {e}")
            raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")

@@ -422,6 +422,8 @@ class AstraDBUploader(Uploader):
        self.upload_config.collection_name = collection_name

        if not self._collection_exists(collection_name):
+            from astrapy.info import CollectionDefinition
+
            astra_db = get_astra_db(
                connection_config=self.connection_config, keyspace=self.upload_config.keyspace
            )

@@ -429,44 +431,56 @@ class AstraDBUploader(Uploader):
                f"creating default astra collection '{collection_name}' with dimension "
                f"{vector_length} and metric {similarity_metric}"
            )
-                dimension=vector_length
+            definition = (
+                CollectionDefinition.builder()
+                .set_vector_dimension(dimension=vector_length)
+                .set_vector_metric(similarity_metric)
+                .build()
            )
+            (astra_db.create_collection(collection_name, definition=definition),)
            return True
        logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
        return False

-    def delete_by_record_id(self, collection: "
+    async def delete_by_record_id(self, collection: "AstraDBAsyncCollection", file_data: FileData):
        logger.debug(
            f"deleting records from collection {collection.name} "
            f"with {self.upload_config.record_id_key} "
            f"set to {file_data.identifier}"
        )
        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
-        delete_resp = collection.delete_many(filter=delete_filter)
+        delete_resp = await collection.delete_many(filter=delete_filter)
        logger.debug(
            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
        )

-    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+    async def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        logger.info(
            f"writing {len(data)} objects to destination "
            f"collection {self.upload_config.collection_name}"
        )

        astra_db_batch_size = self.upload_config.batch_size
+        async_astra_collection = await get_async_astra_collection(
+            connection_config=self.connection_config,
+            collection_name=self.upload_config.collection_name,
+            keyspace=self.upload_config.keyspace,
+        )

-        self.delete_by_record_id(collection=
+        await self.delete_by_record_id(collection=async_astra_collection, file_data=file_data)
+        await asyncio.gather(
+            *[
+                async_astra_collection.insert_many(chunk)
+                for chunk in batch_generator(data, astra_db_batch_size)
+            ]
+        )

-    def run(self,
-        self.run_data(data=data, file_data=file_data, **kwargs)
+    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        data = get_json_data(path=path)
+        await self.run_data(data=data, file_data=file_data)

+    def run(self, **kwargs: Any) -> Any:
+        raise NotImplementedError("Use astradb run_async instead")


astra_db_source_entry = SourceRegistryEntry(
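The rewritten AstraDB uploader is now fully async: it deletes any previous records for the file, then inserts batches concurrently with `asyncio.gather`. A self-contained sketch of that fan-out pattern, with a fake `insert_many` standing in for the real collection call:

```python
import asyncio
from typing import Any, Generator


def batch_generator(items: list[Any], batch_size: int) -> Generator[list[Any], None, None]:
    """Yield consecutive slices of at most batch_size items."""
    for i in range(0, len(items), batch_size):
        yield items[i : i + batch_size]


async def insert_many(batch: list[dict]) -> int:
    await asyncio.sleep(0)  # placeholder for the network call
    return len(batch)


async def upload(data: list[dict], batch_size: int = 20) -> int:
    # Fire one insert per batch and wait for all of them to finish.
    results = await asyncio.gather(
        *[insert_many(chunk) for chunk in batch_generator(data, batch_size)]
    )
    return sum(results)


print(asyncio.run(upload([{"id": i} for i in range(45)])))  # 45
```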
unstructured_ingest/processes/connectors/databricks/volumes_aws.py

@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_aws"
class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint",
    )
    profile: Optional[str] = None
    token: Optional[str] = Field(
unstructured_ingest/processes/connectors/databricks/volumes_azure.py

@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_azure"
class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(

@@ -47,7 +47,7 @@ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
    )
    azure_environment: Optional[str] = Field(
        default=None,
-        description="The Azure environment type for a
+        description="The Azure environment type for a specific set of API endpoints",
        examples=["Public", "UsGov", "China", "Germany"],
    )
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py

@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_gcp"
class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
    )
    profile: Optional[str] = None
    google_credentials: Optional[str] = None
unstructured_ingest/processes/connectors/databricks/volumes_table.py

@@ -166,8 +166,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
        logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
        cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
        logger.debug(
-            f"migrating content from {catalog_path} to "
-            f"table {self.upload_config.table_name}"
+            f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
        )
        data = get_json_data(path=path)
        columns = data[0].keys()
unstructured_ingest/processes/connectors/delta_table.py

@@ -181,6 +181,7 @@ class DeltaTableUploader(Uploader):
        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df, file_data=file_data)

+    @requires_dependencies(["pandas"], extras="delta-table")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        df = get_data_df(path)
        self.upload_dataframe(df=df, file_data=file_data)
unstructured_ingest/processes/connectors/duckdb/base.py

@@ -4,7 +4,7 @@ from typing import Any

from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.interfaces import UploadStager
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
from unstructured_ingest.utils.dep_check import requires_dependencies

_COLUMNS = (
@@ -81,7 +81,7 @@ class BaseDuckDBUploadStager(UploadStager):
    ) -> Path:
        import pandas as pd

-        elements_contents =
+        elements_contents = get_json_data(path=elements_filepath)
        output_filename_suffix = Path(elements_filepath).suffix
        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
unstructured_ingest/processes/connectors/duckdb/duckdb.py

@@ -67,9 +67,8 @@ class DuckDBConnectionConfig(ConnectionConfig):

    @contextmanager
    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
-        with self.get_client() as client:
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor


class DuckDBUploadStagerConfig(UploadStagerConfig):

@@ -116,6 +115,7 @@ class DuckDBUploader(Uploader):
        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df)

+    @requires_dependencies(["pandas"], extras="duckdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        df = get_data_df(path)
        self.upload_dataframe(df=df)
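The `get_cursor` refactor in both DuckDB and MotherDuck collapses two nested `with` blocks into a single statement; the context managers still exit in reverse order. A minimal illustration with generic context managers:

```python
from contextlib import contextmanager


@contextmanager
def managed(name: str):
    print(f"open {name}")
    try:
        yield name
    finally:
        print(f"close {name}")


# Equivalent to nesting `with managed("client"):` around `with managed("cursor"):`.
with managed("client") as client, managed("cursor") as cursor:
    print(f"using {client} / {cursor}")
# prints: open client, open cursor, using client / cursor, close cursor, close client
```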
unstructured_ingest/processes/connectors/duckdb/motherduck.py

@@ -66,9 +66,8 @@ class MotherDuckConnectionConfig(ConnectionConfig):

    @contextmanager
    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
-        with self.get_client() as client:
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor


class MotherDuckUploadStagerConfig(UploadStagerConfig):

@@ -116,6 +115,7 @@ class MotherDuckUploader(Uploader):
        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df)

+    @requires_dependencies(["pandas"], extras="duckdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        df = get_data_df(path)
        self.upload_dataframe(df=df)
unstructured_ingest/processes/connectors/fsspec/s3.py

@@ -134,9 +134,11 @@ class S3Indexer(FsspecIndexer):

        version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
        metadata: dict[str, str] = {}
-        with
+        with (
+            contextlib.suppress(AttributeError),
+            self.connection_config.get_client(protocol=self.index_config.protocol) as client,
+        ):
+            metadata = client.metadata(path=path)
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
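The S3 indexer change uses the Python 3.10+ parenthesized `with` syntax to combine `contextlib.suppress(AttributeError)` with the client context manager, so a client without metadata support simply leaves the metadata dict empty. A small sketch of the same shape, with an illustrative dummy client:

```python
import contextlib


class DummyClient:
    """Illustrative stand-in for a filesystem client with no metadata() method."""

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False


metadata: dict[str, str] = {}
with (
    contextlib.suppress(AttributeError),
    DummyClient() as client,
):
    metadata = client.metadata(path="s3://bucket/key")  # raises AttributeError, suppressed
print(metadata)  # {} because the lookup was skipped rather than crashing
```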
unstructured_ingest/processes/connectors/gitlab.py

@@ -230,8 +230,7 @@ class GitLabDownloader(Downloader):
        download_path = self.get_download_path(file_data=file_data)
        if download_path is None:
            logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
            )
            raise ValueError("Generated invalid download path.")
unstructured_ingest/processes/connectors/google_drive.py

@@ -334,7 +334,6 @@ class GoogleDriveIndexer(Indexer):
        recursive: bool = False,
        previous_path: Optional[str] = None,
    ) -> list[dict]:
-
        fields_input = "nextPageToken, files({})".format(",".join(self.fields))
        q = f"'{object_id}' in parents"
        # Filter by extension but still include any directories

@@ -394,7 +393,6 @@ class GoogleDriveIndexer(Indexer):
        if not self.is_dir(root_info):
            data = [self.map_file_data(root_info)]
        else:
-
            file_contents = self.get_paginated_results(
                files_client=files_client,
                object_id=object_id,
unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py

@@ -5,7 +5,6 @@ from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple

-import pandas as pd
from pydantic import Field, Secret

from unstructured_ingest.data_types.file_data import FileData

@@ -29,6 +28,7 @@ from unstructured_ingest.utils.data_prep import get_data_df
from unstructured_ingest.utils.dep_check import requires_dependencies

if TYPE_CHECKING:
+    from pandas import DataFrame
    from pyarrow import Table as ArrowTable
    from pyiceberg.catalog.rest import RestCatalog
    from pyiceberg.table import Table, Transaction

@@ -96,14 +96,12 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
            return UserAuthError(e)
        if 400 <= response_code < 500:
            logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
            )
            return UserError(e)
        if response_code > 500:
            logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
            )
            return ProviderError(e)
        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)

@@ -217,7 +215,7 @@ class IbmWatsonxUploader(SQLUploader):
        return self.upload_config.record_id_key in self.get_table_columns()

    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
-    def _df_to_arrow_table(self, df:
+    def _df_to_arrow_table(self, df: "DataFrame") -> "ArrowTable":
        import pyarrow as pa

        # Iceberg will automatically fill missing columns with nulls

@@ -277,16 +275,20 @@ class IbmWatsonxUploader(SQLUploader):
        except Exception as e:
            raise ProviderError(f"Failed to upload data to table: {e}")

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
        data_table = self._df_to_arrow_table(df)

        with self.get_table() as table:
            self.upload_data_table(table, data_table, file_data)

+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
        df = pd.DataFrame(data)
        self.upload_dataframe(df=df, file_data=file_data)

+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        df = get_data_df(path=path)
        self.upload_dataframe(df=df, file_data=file_data)
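The watsonx.data uploader keeps converting the pandas DataFrame to a pyarrow Table (`_df_to_arrow_table`) before writing to Iceberg; only the type hints and lazy pandas imports changed here. For orientation, the core conversion step looks roughly like this (a sketch, not the connector's code):

```python
import pandas as pd
import pyarrow as pa

# Two toy element records standing in for staged upload data.
df = pd.DataFrame([{"id": "a", "text": "hello"}, {"id": "b", "text": "world"}])

# Convert to an Arrow table; the pandas index is dropped since it is not a data column.
table = pa.Table.from_pandas(df, preserve_index=False)
print(table.schema)
```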
unstructured_ingest/processes/connectors/kdbai.py

@@ -141,6 +141,7 @@ class KdbaiUploader(Uploader):
        df = pd.DataFrame(data=data)
        self.process_dataframe(df=df)

+    @requires_dependencies(["pandas"], extras="kdbai")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        data = get_data_df(path=path)
        self.process_dataframe(df=data)
unstructured_ingest/processes/connectors/outlook.py

@@ -199,8 +199,7 @@ class OutlookDownloader(Downloader):
        download_path = self.get_download_path(file_data)
        if download_path is None:
            logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
            )
            raise ValueError("Generated invalid download path.")