unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.2"  # pragma: no cover
+__version__ = "1.0.1"  # pragma: no cover
unstructured_ingest/cli/README.md
@@ -0,0 +1,28 @@
+# Ingest CLI
+This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
+
+## Design Reference
+[cli.py](cli.py) is the main entrypoint to run the cli itself. The key point for this is the interaction between all
+source and destination connectors.
+
+To manually run the cli:
+```shell
+PYTHONPATH=. python unstructured_ingest/v2/main.py --help
+```
+
+The `main.py` file simply wraps the generated Click command created in `cli.py`.
+
+### Source Commands
+All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
+different connectors with shared and unique parameters.
+
+### Destination Commands
+All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
+sub command to display all possible destination subcommands. The code in [utils.py](./utils.py) helps structure the
+generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
+
+### Configs
+The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
+are needed to marshal the input dictionary from Click into all the respective configs needed to create a full pipeline run.
+Because Click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
+into dataclasses that have nested fields (such as access configs).
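The nesting described in the README above (sources as subcommands of the ingest Click group, destinations as subcommands of each source) can be sketched with plain Click. The group, command, and option names below are illustrative stand-ins, not the package's actual registration code:

```python
# Illustrative sketch only: the real CLI generates its source and destination
# commands from a connector registry rather than declaring them by hand.
import click


@click.group()
def ingest():
    """Parent ingest group; each source connector hangs off of this."""


@ingest.group()
@click.option("--input-path", required=True)
@click.pass_context
def local(ctx, input_path):
    """Example source command; its subcommands act as destinations."""
    ctx.obj = {"input_path": input_path}


@local.command()
@click.option("--remote-url", required=True)
@click.pass_context
def s3(ctx, remote_url):
    """Example destination command, invoked as: ingest local ... s3 ..."""
    click.echo(f"read from {ctx.obj['input_path']}, write to {remote_url}")


if __name__ == "__main__":
    ingest()
```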
unstructured_ingest/embed/mixedbreadai.py
@@ -114,7 +114,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
 @dataclass
 class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
-
     config: MixedbreadAIEmbeddingConfig
 
     async def get_exemplary_embedding(self) -> list[float]:
unstructured_ingest/interfaces/upload_stager.py
@@ -8,7 +8,7 @@ from pydantic import BaseModel
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import BaseProcess
 from unstructured_ingest.utils import ndjson
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data, write_data
 
 
 class UploadStagerConfig(BaseModel):
@@ -43,7 +43,7 @@ class UploadStager(BaseProcess, ABC):
             writer.f.flush()
 
     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-        elements_contents =
+        elements_contents = get_json_data(path=input_file)
 
         conformed_elements = [
            self.conform_dict(element_dict=element, file_data=file_data)
unstructured_ingest/interfaces/uploader.py
@@ -7,7 +7,7 @@ from pydantic import BaseModel
 
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import BaseConnector, BaseProcess
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data
 
 
 class UploaderConfig(BaseModel):
@@ -45,11 +45,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
         return False
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         await self.run_data_async(data=data, file_data=file_data, **kwargs)
 
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
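The stager and uploader hunks above (and several connector hunks below) switch to `get_json_data` from `unstructured_ingest.utils.data_prep`. A hedged sketch of what a helper with that name plausibly does, reading either a JSON array or newline-delimited JSON into a list of element dicts; this is an assumption, not the library's actual implementation:

```python
# Hedged approximation of a get_json_data-style helper; the real one lives in
# unstructured_ingest.utils.data_prep and may handle more cases.
import json
from pathlib import Path


def get_json_data(path: Path) -> list[dict]:
    if path.suffix == ".ndjson":
        # newline-delimited JSON: one element dict per line
        with path.open() as f:
            return [json.loads(line) for line in f if line.strip()]
    # plain JSON: the whole file is a list of element dicts
    with path.open() as f:
        return json.load(f)
```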
unstructured_ingest/main.py
File without changes
|
@@ -119,7 +119,7 @@ class PipelineStep(ABC):
|
|
|
119
119
|
iterable = iterable or []
|
|
120
120
|
if iterable:
|
|
121
121
|
logger.info(
|
|
122
|
-
f"calling {self.__class__.__name__}
|
|
122
|
+
f"calling {self.__class__.__name__} with {len(iterable)} docs", # type: ignore
|
|
123
123
|
)
|
|
124
124
|
else:
|
|
125
125
|
logger.info(f"calling {self.__class__.__name__} with no inputs")
|
|
@@ -220,7 +220,7 @@ class Pipeline:
|
|
|
220
220
|
|
|
221
221
|
def _run(self):
|
|
222
222
|
logger.info(
|
|
223
|
-
f"running local pipeline: {self} with configs:
|
|
223
|
+
f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
|
|
224
224
|
)
|
|
225
225
|
if self.context.mp_supported:
|
|
226
226
|
manager = mp.Manager()
|
|
unstructured_ingest/processes/chunker.py
@@ -24,6 +24,9 @@ class ChunkerConfig(BaseModel):
         default="https://api.unstructuredapp.io/general/v0/general",
         description="If chunking via api, use the following host.",
     )
+    chunk_api_timeout_ms: Optional[int] = Field(
+        default=None, description="Timeout in milliseconds for all api call during chunking."
+    )
     chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
     chunk_api_key: Optional[SecretStr] = Field(
         default=None, description="API Key for chunking endpoint."
@@ -120,6 +123,7 @@ class Chunker(BaseProcess, ABC):
                 api_key=self.config.chunk_api_key.get_secret_value(),
                 filename=elements_filepath,
                 api_parameters=self.config.to_chunking_kwargs(),
+                timeout_ms=self.config.chunk_api_timeout_ms,
             )
 
             elements = assign_and_map_hash_ids(elements=elements)
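The new `chunk_api_timeout_ms` option is passed through to the chunking API call as `timeout_ms`. A hedged usage sketch, assuming `ChunkerConfig` is constructed directly and using only fields visible in this diff:

```python
# Sketch: enable API-based chunking with the new timeout field.
# Other ChunkerConfig fields (strategy, endpoint, etc.) are omitted here.
from unstructured_ingest.processes.chunker import ChunkerConfig

config = ChunkerConfig(
    chunk_by_api=True,
    chunk_api_key="my-api-key",      # SecretStr field; a plain str is accepted
    chunk_api_timeout_ms=60_000,     # give up on the chunking API call after 60 s
)
```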
unstructured_ingest/processes/connectors/airtable.py
@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from uuid import NAMESPACE_DNS, uuid5
 
-import pandas
 from pydantic import BaseModel, Field, Secret, field_validator
 
 from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
@@ -213,10 +212,13 @@ class AirtableDownloader(Downloader):
         row_dict.update(table_row["fields"])
         return row_dict
 
+    @requires_dependencies(["pandas"], extras="airtable")
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        import pandas as pd
+
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
-        df =
+        df = pd.DataFrame.from_dict(
             data=[self._table_row_to_dict(table_row=row) for row in table_contents]
         ).sort_index(axis=1)
         download_path = self.get_download_path(file_data=file_data)
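The pattern introduced here, a `requires_dependencies` decorator plus an import inside the method body, recurs throughout this release: pandas becomes an optional, extra-specific dependency that is only imported when a connector actually runs. A generic sketch of the idea (not the library's own decorator, which also maps the missing module to a pip extra message):

```python
# Generic sketch of the lazy-import pattern used above; stand-in decorator only.
import functools
import importlib


def requires_dependencies(modules: list[str], extras: str):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for module in modules:
                try:
                    importlib.import_module(module)
                except ImportError as err:
                    raise ImportError(
                        f"{module} is required; install it via the '{extras}' extra"
                    ) from err
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["pandas"], extras="airtable")
def rows_to_frame(rows: list[dict]):
    import pandas as pd  # imported only after the dependency check passes

    return pd.DataFrame(rows).sort_index(axis=1)
```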
unstructured_ingest/processes/connectors/astradb.py
@@ -43,7 +43,7 @@ from unstructured_ingest.processes.connector_registry import (
 )
 from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
 from unstructured_ingest.utils.constants import RECORD_ID_LABEL
-from unstructured_ingest.utils.data_prep import batch_generator,
+from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
 
@@ -465,7 +465,7 @@ class AstraDBUploader(Uploader):
                 collection.insert_many(chunk)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
 
 
unstructured_ingest/processes/connectors/databricks/volumes_aws.py
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_aws"
 class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint",
     )
     profile: Optional[str] = None
     token: Optional[str] = Field(
unstructured_ingest/processes/connectors/databricks/volumes_azure.py
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_azure"
 class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
     )
     profile: Optional[str] = None
     azure_workspace_resource_id: Optional[str] = Field(
@@ -47,7 +47,7 @@ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     )
     azure_environment: Optional[str] = Field(
         default=None,
-        description="The Azure environment type for a
+        description="The Azure environment type for a specific set of API endpoints",
         examples=["Public", "UsGov", "China", "Germany"],
     )
 
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_gcp"
 class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
     )
     profile: Optional[str] = None
     google_credentials: Optional[str] = None
unstructured_ingest/processes/connectors/databricks/volumes_table.py
@@ -166,8 +166,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
             cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
             logger.debug(
-                f"migrating content from {catalog_path} to "
-                f"table {self.upload_config.table_name}"
+                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
             )
             data = get_json_data(path=path)
             columns = data[0].keys()
unstructured_ingest/processes/connectors/delta_table.py
@@ -181,6 +181,7 @@ class DeltaTableUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)
 
+    @requires_dependencies(["pandas"], extras="delta-table")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df, file_data=file_data)
unstructured_ingest/processes/connectors/duckdb/base.py
@@ -4,7 +4,7 @@ from typing import Any
 
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import UploadStager
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 _COLUMNS = (
@@ -81,7 +81,7 @@ class BaseDuckDBUploadStager(UploadStager):
     ) -> Path:
         import pandas as pd
 
-        elements_contents =
+        elements_contents = get_json_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
unstructured_ingest/processes/connectors/duckdb/duckdb.py
@@ -67,9 +67,8 @@ class DuckDBConnectionConfig(ConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
-        with self.get_client() as client:
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor
 
 
 class DuckDBUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class DuckDBUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df)
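The DuckDB and MotherDuck `get_cursor` changes above and below collapse a nested `with` into a single statement with two context managers; entry and exit order are unchanged, and the cursor still closes before the client. A tiny standalone illustration with stand-in context managers:

```python
# Stand-in context managers; not DuckDB objects.
from contextlib import contextmanager


@contextmanager
def get_client():
    try:
        yield "client"
    finally:
        print("client closed")


@contextmanager
def get_cursor(client):
    try:
        yield f"cursor of {client}"
    finally:
        print("cursor closed")


# combined form, as in the refactor: cursor exits first, then the client
with get_client() as client, get_cursor(client) as cursor:
    print("using", cursor)
```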
unstructured_ingest/processes/connectors/duckdb/motherduck.py
@@ -66,9 +66,8 @@ class MotherDuckConnectionConfig(ConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
-        with self.get_client() as client:
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor
 
 
 class MotherDuckUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class MotherDuckUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df)
unstructured_ingest/processes/connectors/fsspec/s3.py
@@ -134,9 +134,11 @@ class S3Indexer(FsspecIndexer):
 
         version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
         metadata: dict[str, str] = {}
-        with
+        with (
+            contextlib.suppress(AttributeError),
+            self.connection_config.get_client(protocol=self.index_config.protocol) as client,
+        ):
+            metadata = client.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
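The S3 indexer change wraps the metadata lookup in a parenthesized multi-manager `with` so that an `AttributeError` from the fsspec client simply leaves `metadata` as the empty dict initialized just above (parenthesized context managers need Python 3.10+). A minimal illustration with a stand-in client:

```python
# Minimal illustration of the suppress-and-continue pattern; FakeClient stands
# in for the fsspec client, which may not implement .metadata().
import contextlib


class FakeClient:
    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False


metadata: dict[str, str] = {}
with (
    contextlib.suppress(AttributeError),
    FakeClient() as client,
):
    metadata = client.metadata(path="s3://bucket/key")  # raises AttributeError

print(metadata)  # {} -- the error was swallowed, indexing continues
```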
unstructured_ingest/processes/connectors/gitlab.py
@@ -230,8 +230,7 @@ class GitLabDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 
unstructured_ingest/processes/connectors/google_drive.py
@@ -334,7 +334,6 @@ class GoogleDriveIndexer(Indexer):
         recursive: bool = False,
         previous_path: Optional[str] = None,
     ) -> list[dict]:
-
         fields_input = "nextPageToken, files({})".format(",".join(self.fields))
         q = f"'{object_id}' in parents"
         # Filter by extension but still include any directories
@@ -394,7 +393,6 @@ class GoogleDriveIndexer(Indexer):
         if not self.is_dir(root_info):
             data = [self.map_file_data(root_info)]
         else:
-
             file_contents = self.get_paginated_results(
                 files_client=files_client,
                 object_id=object_id,
unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py
@@ -5,7 +5,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.data_types.file_data import FileData
@@ -29,6 +28,7 @@ from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
+    from pandas import DataFrame
     from pyarrow import Table as ArrowTable
     from pyiceberg.catalog.rest import RestCatalog
     from pyiceberg.table import Table, Transaction
@@ -96,14 +96,12 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
             return UserAuthError(e)
         if 400 <= response_code < 500:
             logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
             )
             return UserError(e)
         if response_code > 500:
             logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
             )
             return ProviderError(e)
         logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
@@ -217,7 +215,7 @@ class IbmWatsonxUploader(SQLUploader):
         return self.upload_config.record_id_key in self.get_table_columns()
 
     @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
-    def _df_to_arrow_table(self, df:
+    def _df_to_arrow_table(self, df: "DataFrame") -> "ArrowTable":
         import pyarrow as pa
 
         # Iceberg will automatically fill missing columns with nulls
@@ -277,16 +275,20 @@ class IbmWatsonxUploader(SQLUploader):
         except Exception as e:
             raise ProviderError(f"Failed to upload data to table: {e}")
 
-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         data_table = self._df_to_arrow_table(df)
 
         with self.get_table() as table:
             self.upload_data_table(table, data_table, file_data)
 
+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data)
         self.upload_dataframe(df=df, file_data=file_data)
 
+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path=path)
         self.upload_dataframe(df=df, file_data=file_data)
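`_df_to_arrow_table` now accepts a lazily-typed `"DataFrame"` and returns a pyarrow table. The conversion presumably goes through pyarrow's standard pandas bridge; a hedged sketch (the connector's real method also reconciles columns with the Iceberg table schema):

```python
# Hedged sketch of a DataFrame -> pyarrow Table conversion, not the
# connector's actual implementation.
import pandas as pd
import pyarrow as pa

df = pd.DataFrame([{"element_id": "abc", "text": "hello"}])
table: pa.Table = pa.Table.from_pandas(df, preserve_index=False)
print(table.schema)
```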
unstructured_ingest/processes/connectors/kdbai.py
@@ -141,6 +141,7 @@ class KdbaiUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="kdbai")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         data = get_data_df(path=path)
         self.process_dataframe(df=data)
unstructured_ingest/processes/connectors/outlook.py
@@ -199,8 +199,7 @@ class OutlookDownloader(Downloader):
         download_path = self.get_download_path(file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 
unstructured_ingest/processes/connectors/redisdb.py
@@ -143,36 +143,40 @@ class RedisUploader(Uploader):
         await asyncio.gather(*[self._write_batch(batch, redis_stack) for batch in batches])
 
     async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
-        async with
+        async with (
+            self.connection_config.create_async_client() as async_client,
+            async_client.pipeline(transaction=True) as pipe,
+        ):
+            for element in batch:
+                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+                if redis_stack:
+                    pipe.json().set(key_with_prefix, "$", element)
+                else:
+                    pipe.set(key_with_prefix, json.dumps(element))
+            await pipe.execute()
 
     @requires_dependencies(["redis"], extras="redis")
     async def _check_redis_stack(self, element: dict) -> bool:
         from redis import exceptions as redis_exceptions
 
         redis_stack = True
-        async with
+        async with (
+            self.connection_config.create_async_client() as async_client,
+            async_client.pipeline(transaction=True) as pipe,
+        ):
+            key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+            try:
+                # Redis with stack extension supports JSON type
+                await pipe.json().set(key_with_prefix, "$", element).execute()
+            except redis_exceptions.ResponseError as e:
+                message = str(e)
+                if "unknown command `JSON.SET`" in message:
+                    # if this error occurs, Redis server doesn't support JSON type,
+                    # so save as string type instead
+                    await pipe.set(key_with_prefix, json.dumps(element)).execute()
+                    redis_stack = False
+                else:
+                    raise e
         return redis_stack
 
 
unstructured_ingest/processes/connectors/salesforce.py
@@ -81,7 +81,7 @@ class SalesforceAccessConfig(AccessConfig):
     consumer_key: str
     private_key_path: Optional[Path] = Field(
         default=None,
-        description="Path to the private key file.
+        description="Path to the private key file. Key file is usually named server.key.",
     )
     private_key: Optional[str] = Field(default=None, description="Contents of the private key")
 
unstructured_ingest/processes/connectors/slack.py
@@ -166,8 +166,7 @@ class SlackDownloader(Downloader):
         download_path = self.get_download_path(file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 
unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py
@@ -2,6 +2,7 @@ import json
 import os
 from contextlib import contextmanager
 from dataclasses import dataclass
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
@@ -128,6 +129,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @contextmanager
     def get_cursor(self) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor() as cursor:
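The `run` overrides added to this and the following SQL uploaders (Postgres, SingleStore, Snowflake) only delegate to `super().run(...)`; the point is to attach the pandas `requires_dependencies` check to each connector's entry point. A generic sketch of why re-decorating an inherited method requires an override; the decorator and classes here are stand-ins, not the library's SQLUploader hierarchy:

```python
# Stand-in decorator and classes illustrating the delegate-to-super pattern.
import functools
import importlib


def requires_pandas(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        importlib.import_module("pandas")  # raises ImportError if missing
        return fn(*args, **kwargs)

    return wrapper


class SQLUploader:
    def run(self, path: str) -> None:
        print(f"uploading {path}")


class PostgresUploader(SQLUploader):
    @requires_pandas
    def run(self, path: str) -> None:
        # the override adds only the dependency check; behavior is inherited
        super().run(path)


PostgresUploader().run("elements.json")
```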
unstructured_ingest/processes/connectors/sql/postgres.py
@@ -1,9 +1,11 @@
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -144,6 +146,10 @@ class PostgresUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "%s"
 
+    @requires_dependencies(["pandas"], extras="postgres")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
 
 postgres_source_entry = SourceRegistryEntry(
     connection_config=PostgresConnectionConfig,
unstructured_ingest/processes/connectors/sql/singlestore.py
@@ -1,10 +1,12 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -65,12 +67,11 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["SingleStoreCursor", None, None]:
-        with self.get_connection() as connection:
-                cursor.close()
+        with self.get_connection() as connection, connection.cursor() as cursor:
+            try:
+                yield cursor
+            finally:
+                cursor.close()
 
 
 class SingleStoreIndexerConfig(SQLIndexerConfig):
@@ -131,6 +132,10 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["pandas"], extras="singlestore")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
unstructured_ingest/processes/connectors/sql/snowflake.py
@@ -1,6 +1,7 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
@@ -173,6 +174,10 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"
 
+    @requires_dependencies(["pandas"], extras="snowflake")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"], extras="snowflake")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]