unstructured-ingest 0.7.2 (py3-none-any.whl) → 1.0.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.
Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +48 -34
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
@@ -143,36 +143,40 @@ class RedisUploader(Uploader):
         await asyncio.gather(*[self._write_batch(batch, redis_stack) for batch in batches])

     async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
-        async with self.connection_config.create_async_client() as async_client:
-            async with async_client.pipeline(transaction=True) as pipe:
-                for element in batch:
-                    key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
-                    if redis_stack:
-                        pipe.json().set(key_with_prefix, "$", element)
-                    else:
-                        pipe.set(key_with_prefix, json.dumps(element))
-                await pipe.execute()
+        async with (
+            self.connection_config.create_async_client() as async_client,
+            async_client.pipeline(transaction=True) as pipe,
+        ):
+            for element in batch:
+                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+                if redis_stack:
+                    pipe.json().set(key_with_prefix, "$", element)
+                else:
+                    pipe.set(key_with_prefix, json.dumps(element))
+            await pipe.execute()

     @requires_dependencies(["redis"], extras="redis")
     async def _check_redis_stack(self, element: dict) -> bool:
         from redis import exceptions as redis_exceptions

         redis_stack = True
-        async with self.connection_config.create_async_client() as async_client:
-            async with async_client.pipeline(transaction=True) as pipe:
-                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
-                try:
-                    # Redis with stack extension supports JSON type
-                    await pipe.json().set(key_with_prefix, "$", element).execute()
-                except redis_exceptions.ResponseError as e:
-                    message = str(e)
-                    if "unknown command `JSON.SET`" in message:
-                        # if this error occurs, Redis server doesn't support JSON type,
-                        # so save as string type instead
-                        await pipe.set(key_with_prefix, json.dumps(element)).execute()
-                        redis_stack = False
-                    else:
-                        raise e
+        async with (
+            self.connection_config.create_async_client() as async_client,
+            async_client.pipeline(transaction=True) as pipe,
+        ):
+            key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+            try:
+                # Redis with stack extension supports JSON type
+                await pipe.json().set(key_with_prefix, "$", element).execute()
+            except redis_exceptions.ResponseError as e:
+                message = str(e)
+                if "unknown command `JSON.SET`" in message:
+                    # if this error occurs, Redis server doesn't support JSON type,
+                    # so save as string type instead
+                    await pipe.set(key_with_prefix, json.dumps(element)).execute()
+                    redis_stack = False
+                else:
+                    raise e
         return redis_stack

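The refactor above folds the nested async with blocks into one parenthesized group, a form that requires Python 3.10 or later. A minimal, self-contained sketch of the same pattern, using hypothetical context managers in place of the Redis client and pipeline:

import asyncio
from contextlib import asynccontextmanager


@asynccontextmanager
async def open_client():
    # hypothetical stand-in for connection_config.create_async_client()
    yield "client"


@asynccontextmanager
async def open_pipeline(client):
    # hypothetical stand-in for async_client.pipeline(transaction=True)
    yield f"{client}:pipeline"


async def main() -> None:
    # Python 3.10+ allows grouping several context managers in one parenthesized `async with`
    async with (
        open_client() as client,
        open_pipeline(client) as pipe,
    ):
        print(client, pipe)


asyncio.run(main())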
@@ -81,7 +81,7 @@ class SalesforceAccessConfig(AccessConfig):
     consumer_key: str
     private_key_path: Optional[Path] = Field(
         default=None,
-        description="Path to the private key file. " "Key file is usually named server.key.",
+        description="Path to the private key file. Key file is usually named server.key.",
     )
     private_key: Optional[str] = Field(default=None, description="Contents of the private key")

@@ -166,8 +166,7 @@ class SlackDownloader(Downloader):
         download_path = self.get_download_path(file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be missing"
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missingfrom FileData."
             )
             raise ValueError("Generated invalid download path.")

@@ -2,6 +2,7 @@ import json
 import os
 from contextlib import contextmanager
 from dataclasses import dataclass
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret
@@ -128,6 +129,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @contextmanager
     def get_cursor(self) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor() as cursor:
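This run override (repeated below for the postgres, singlestore, snowflake, sqlite, and vastdb uploaders) changes no behavior; it only attaches the pandas dependency check to the entry point so a missing extra fails fast. A rough sketch of the pattern, with a hypothetical decorator and base class standing in for requires_dependencies and SQLUploader:

import importlib.util
from functools import wraps
from pathlib import Path
from typing import Any, Callable, Optional


def requires_dependencies(deps: list[str], extras: Optional[str] = None) -> Callable:
    # hypothetical stand-in for the library's decorator: fail fast with an
    # install hint if a required import is missing, before any work is done
    def decorator(fn: Callable) -> Callable:
        @wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            for dep in deps:
                if importlib.util.find_spec(dep) is None:
                    hint = f"unstructured-ingest[{extras}]" if extras else dep
                    raise ImportError(f"missing dependency {dep!r}; install {hint!r}")
            return fn(*args, **kwargs)
        return wrapper
    return decorator


class BaseSQLUploader:
    def run(self, path: Path, file_data: dict, **kwargs: Any) -> None:
        print(f"uploading {path}")


class PostgresLikeUploader(BaseSQLUploader):
    # the override adds no logic of its own; it only wraps run() with the dependency check
    @requires_dependencies(["pandas"], extras="postgres")
    def run(self, path: Path, file_data: dict, **kwargs: Any) -> None:
        super().run(path=path, file_data=file_data, **kwargs)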
@@ -1,9 +1,11 @@
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Generator, Optional
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret

+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -144,6 +146,10 @@ class PostgresUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "%s"

+    @requires_dependencies(["pandas"], extras="postgres")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+

 postgres_source_entry = SourceRegistryEntry(
     connection_config=PostgresConnectionConfig,
@@ -1,10 +1,12 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret

+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -65,12 +67,11 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):

     @contextmanager
     def get_cursor(self) -> Generator["SingleStoreCursor", None, None]:
-        with self.get_connection() as connection:
-            with connection.cursor() as cursor:
-                try:
-                    yield cursor
-                finally:
-                    cursor.close()
+        with self.get_connection() as connection, connection.cursor() as cursor:
+            try:
+                yield cursor
+            finally:
+                cursor.close()


 class SingleStoreIndexerConfig(SQLIndexerConfig):
@@ -131,6 +132,10 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"], extras="singlestore")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
@@ -1,6 +1,7 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret
@@ -173,6 +174,10 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"

+    @requires_dependencies(["pandas"], extras="snowflake")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"], extras="snowflake")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
@@ -36,9 +36,9 @@ from unstructured_ingest.interfaces import (
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.constants import RECORD_ID_LABEL
 from unstructured_ingest.utils.data_prep import (
-    get_data,
     get_data_df,
     get_enhanced_element_id,
+    get_json_data,
     split_dataframe,
     write_data,
 )
@@ -122,8 +122,7 @@ class SQLIndexer(Indexer, ABC):
         id_batches: list[frozenset[str]] = [
             frozenset(
                 ids[
-                    i
-                    * self.index_config.batch_size : (i + 1)  # noqa
+                    i * self.index_config.batch_size : (i + 1)  # noqa
                     * self.index_config.batch_size
                 ]
             )
@@ -272,7 +271,7 @@ class SQLUploadStager(UploadStager):
     ) -> Path:
         import pandas as pd

-        elements_contents = get_data(path=elements_filepath)
+        elements_contents = get_json_data(path=elements_filepath)

         df = pd.DataFrame(
             data=[
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Generator

 from pydantic import Field, Secret, model_validator

+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -133,6 +134,10 @@ class SQLiteUploader(SQLUploader):
     connection_config: SQLiteConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"])
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"])
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
@@ -1,5 +1,6 @@
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional

 from pydantic import Field, Secret
@@ -68,9 +69,8 @@ class VastdbConnectionConfig(SQLConnectionConfig):

     @contextmanager
     def get_cursor(self) -> "VastdbTransaction":
-        with self.get_connection() as connection:
-            with connection.transaction() as transaction:
-                yield transaction
+        with self.get_connection() as connection, connection.transaction() as transaction:
+            yield transaction

     @contextmanager
     def get_table(self, table_name: str) -> "VastdbTable":
@@ -190,6 +190,10 @@ class VastdbUploader(SQLUploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

+    @requires_dependencies(["pandas"], extras="vastdb")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pyarrow", "pandas"], extras="vastdb")
     def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         import numpy as np
@@ -108,7 +108,6 @@ class VectaraUploaderConfig(UploaderConfig):

 @dataclass
 class VectaraUploader(Uploader):
-
     connector_type: str = CONNECTOR_TYPE
     upload_config: VectaraUploaderConfig
     connection_config: VectaraConnectionConfig
@@ -336,7 +335,6 @@ class VectaraUploader(Uploader):
         file_data: FileData,
         **kwargs: Any,
     ) -> None:
-
         logger.info(f"inserting / updating {len(data)} documents to Vectara ")
         await asyncio.gather(*(self._index_document(vdoc) for vdoc in data))

@@ -53,7 +53,6 @@ class ZendeskConnectionConfig(ConnectionConfig):
     access_config: Secret[ZendeskAccessConfig]

     def get_client(self) -> ZendeskClient:
-
         access_config = self.access_config.get_secret_value()

         return ZendeskClient(
@@ -206,7 +205,6 @@ class ZendeskDownloader(Downloader):
             await f.write(comment.as_text())

     async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-
         zendesk_filedata = ZendeskFileData.cast(file_data=file_data)

         item_type = zendesk_filedata.additional_metadata.item_type
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
 from pydantic import BaseModel, Field, SecretStr

 from unstructured_ingest.interfaces.process import BaseProcess
-from unstructured_ingest.utils.data_prep import get_data
+from unstructured_ingest.utils.data_prep import get_json_data

 if TYPE_CHECKING:
     from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
@@ -192,7 +192,7 @@
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
-        elements = get_data(path=elements_filepath)
+        elements = get_json_data(path=elements_filepath)
         if not elements:
             return []
         embedded_elements = embedder.embed_documents(elements=elements)
@@ -13,7 +13,7 @@ from unstructured_ingest.logger import logger
 class FiltererConfig(BaseModel):
     file_glob: Optional[list[str]] = Field(
         default=None,
-        description="file globs to limit which data_types of " "files are accepted",
+        description="file globs to limit which data_types of files are accepted",
         examples=["*.pdf", "*.html"],
     )
     max_file_size: Optional[int] = Field(
@@ -68,6 +68,9 @@ class PartitionerConfig(BaseModel):
         description="Use a remote API to partition the files."
         " Otherwise, use the function from partition.auto",
     )
+    api_timeout_ms: Optional[int] = Field(
+        default=None, description="Timeout in milliseconds for all api call during partitioning."
+    )
     api_key: Optional[SecretStr] = Field(
         default=None, description="API Key for partition endpoint."
     )
@@ -188,6 +191,7 @@
            api_key=self.config.api_key.get_secret_value(),
            filename=filename,
            api_parameters=self.config.to_partition_kwargs(),
+           timeout_ms=self.config.api_timeout_ms,
        )

        # Append the data source metadata the auto partition does for you
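Only api_key and api_timeout_ms appear in the hunk above, so this configuration sketch omits whatever other fields are needed to enable remote partitioning; the values are illustrative placeholders:

from unstructured_ingest.processes.partitioner import PartitionerConfig

config = PartitionerConfig(
    api_key="YOUR-API-KEY",   # placeholder credential
    api_timeout_ms=120_000,   # give up on any partition API call after two minutes
)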
@@ -4,7 +4,7 @@ from typing import Any

 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import UploadStager, UploadStagerConfig
-from unstructured_ingest.utils.data_prep import get_data, write_data
+from unstructured_ingest.utils.data_prep import get_json_data, write_data


 class BlobStoreUploadStagerConfig(UploadStagerConfig):
@@ -27,6 +27,6 @@ class BlobStoreUploadStager(UploadStager):
     ) -> Path:
         output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
         # Always save as json
-        data = get_data(elements_filepath)
+        data = get_json_data(elements_filepath)
         write_data(path=output_file.with_suffix(".json"), data=data)
         return output_file.with_suffix(".json")
@@ -80,7 +80,11 @@ def wrap_error(e: Exception) -> Exception:


 async def call_api_async(
-    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
+    server_url: Optional[str],
+    api_key: Optional[str],
+    filename: Path,
+    api_parameters: dict,
+    timeout_ms: Optional[int] = None,
 ) -> list[dict]:
     """Call the Unstructured API using unstructured-client.

@@ -94,13 +98,10 @@ async def call_api_async(
     """
     from unstructured_client import UnstructuredClient

-    client = UnstructuredClient(
-        server_url=server_url,
-        api_key_auth=api_key,
-    )
+    client = UnstructuredClient(server_url=server_url, api_key_auth=api_key)
     partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
     try:
-        res = await client.general.partition_async(request=partition_request)
+        res = await client.general.partition_async(request=partition_request, timeout_ms=timeout_ms)
     except Exception as e:
         raise wrap_error(e)

@@ -108,7 +109,11 @@


 def call_api(
-    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
+    server_url: Optional[str],
+    api_key: Optional[str],
+    filename: Path,
+    api_parameters: dict,
+    timeout_ms: Optional[int] = None,
 ) -> list[dict]:
     """Call the Unstructured API using unstructured-client.

@@ -128,7 +133,7 @@
     )
     partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
     try:
-        res = client.general.partition(request=partition_request)
+        res = client.general.partition(request=partition_request, timeout_ms=timeout_ms)
     except Exception as e:
         raise wrap_error(e)

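A usage sketch of the updated helper, assuming only the signature shown above; the endpoint, key, file, and partition parameters are placeholders:

from pathlib import Path

from unstructured_ingest.unstructured_api import call_api

elements = call_api(
    server_url="https://api.example.com",   # placeholder endpoint
    api_key="YOUR-API-KEY",                  # placeholder credential
    filename=Path("document.pdf"),           # placeholder input file
    api_parameters={"strategy": "hi_res"},   # placeholder partition parameters
    timeout_ms=60_000,                       # abort the request after one minute
)
print(f"partitioned into {len(elements)} elements")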
@@ -2,7 +2,7 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
 from uuid import NAMESPACE_DNS, uuid5

 from unstructured_ingest.data_types.file_data import FileData
@@ -171,15 +171,13 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
         raise IOError("Unsupported file type: {path}")


-def get_data(path: Union[Path, str]) -> list[dict]:
-    if isinstance(path, str):
-        path = Path(path)
-    try:
-        return get_data_by_suffix(path=path)
-    except Exception as e:
-        logger.warning(f"failed to read {path} by extension: {e}")
-        # Fall back
+def get_json_data(path: Path) -> list[dict]:
     with path.open() as f:
+        # Attempt by prefix
+        if path.suffix == ".json":
+            return json.load(f)
+        elif path.suffix == ".ndjson":
+            return ndjson.load(f)
         try:
             return json.load(f)
         except Exception as e:
@@ -188,29 +186,7 @@ def get_data(path: Union[Path, str]) -> list[dict]:
             return ndjson.load(f)
         except Exception as e:
             logger.warning(f"failed to read {path} as ndjson: {e}")
-
-    import pandas as pd
-
-    try:
-        df = pd.read_csv(path)
-        return df.to_dict(orient="records")
-    except Exception as e:
-        logger.warning(f"failed to read {path} as csv: {e}")
-    try:
-        df = pd.read_parquet(path)
-        return df.to_dict(orient="records")
-    except Exception as e:
-        logger.warning(f"failed to read {path} as parquet: {e}")
-
-
-def get_json_data(path: Path) -> list[dict]:
-    with path.open() as f:
-        if path.suffix == ".json":
-            return json.load(f)
-        elif path.suffix == ".ndjson":
-            return ndjson.load(f)
-        else:
-            raise ValueError(f"Unsupported file type: {path}")
+    raise ValueError(f"Unsupported json file: {path}")


 @requires_dependencies(["pandas"])
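To illustrate the stricter behavior of the consolidated get_json_data, here is a sketch based only on the diff above; the temporary file and element payload are hypothetical:

import json
import tempfile
from pathlib import Path

from unstructured_ingest.utils.data_prep import get_json_data

with tempfile.TemporaryDirectory() as tmp:
    elements_path = Path(tmp) / "elements.json"
    # hypothetical element payload written as a .json file
    elements_path.write_text(json.dumps([{"element_id": "abc123", "text": "hello"}]))

    elements = get_json_data(path=elements_path)  # .json and .ndjson are read by suffix
    print(elements[0]["element_id"])

# Unlike the old get_data, there is no pandas-based CSV/parquet fallback: a file that is
# neither valid json nor ndjson now raises ValueError("Unsupported json file: ...").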