unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.7.1"  # pragma: no cover
+__version__ = "1.0.1"  # pragma: no cover
unstructured_ingest/cli/README.md
@@ -0,0 +1,28 @@
+# Ingest CLI
+This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
+
+## Design Reference
+[cli.py](cli.py) is the main entrypoint to run the cli itself. The key point for this is the interaction between all
+source and destination connectors.
+
+To manually run the cli:
+```shell
+PYTHONPATH=. python unstructured_ingest/v2/main.py --help
+```
+
+The `main.py` file simply wraps the generated Click command created in `cli.py`.
+
+### Source Commands
+All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
+different connectors with shared and unique parameters.
+
+### Destination Commands
+All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
+sub command to display all possible destination subcommands. The code in [utils.py](./utils.py) helps structure the
+generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
+
+### Configs
+The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
+are needed to marshall the input dictionary from Click into all the respective configs needed to create a full pipeline run.
+Because click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
+into dataclasses that have nested fields (such as access configs).
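The `extract_config` behavior described in the new README can be illustrated with a small sketch. The dataclasses and the simplified helper below are hypothetical stand-ins, not the actual code in `cli/utils.py`; they only show the idea of rebuilding nested configs (such as access configs) from the flat dictionary Click produces.

```python
from dataclasses import dataclass, fields, is_dataclass
from typing import Any, Type, TypeVar

T = TypeVar("T")


@dataclass
class AccessConfig:  # hypothetical example config
    api_key: str


@dataclass
class ConnectorConfig:  # hypothetical example config with a nested field
    remote_url: str
    access_config: AccessConfig


def extract_config(flat: dict[str, Any], config_cls: Type[T]) -> T:
    """Recursively pull the fields a dataclass needs out of a flat dict of CLI options."""
    kwargs = {}
    for f in fields(config_cls):
        if is_dataclass(f.type):
            # Nested dataclass fields get rebuilt from the same flat dict.
            kwargs[f.name] = extract_config(flat, f.type)
        elif f.name in flat:
            kwargs[f.name] = flat[f.name]
    return config_cls(**kwargs)


# Click hands back one flat dict; the helper rebuilds the nested config objects.
cli_options = {"remote_url": "s3://bucket/path", "api_key": "example-key"}
config = extract_config(cli_options, ConnectorConfig)
```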
unstructured_ingest/embed/mixedbreadai.py
CHANGED
@@ -114,7 +114,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
 @dataclass
 class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
-
     config: MixedbreadAIEmbeddingConfig
 
     async def get_exemplary_embedding(self) -> list[float]:
unstructured_ingest/interfaces/upload_stager.py
CHANGED
@@ -8,7 +8,7 @@ from pydantic import BaseModel
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import BaseProcess
 from unstructured_ingest.utils import ndjson
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data, write_data
 
 
 class UploadStagerConfig(BaseModel):
@@ -43,7 +43,7 @@ class UploadStager(BaseProcess, ABC):
             writer.f.flush()
 
     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-        elements_contents =
+        elements_contents = get_json_data(path=input_file)
 
         conformed_elements = [
             self.conform_dict(element_dict=element, file_data=file_data)
unstructured_ingest/interfaces/uploader.py
CHANGED
@@ -7,7 +7,7 @@ from pydantic import BaseModel
 
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import BaseConnector, BaseProcess
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data
 
 
 class UploaderConfig(BaseModel):
@@ -45,11 +45,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
         return False
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         await self.run_data_async(data=data, file_data=file_data, **kwargs)
 
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
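Several interfaces and connectors in this release (upload stager, uploader, AstraDB, DuckDB) replace inline file reads with a shared `get_json_data` helper from `utils/data_prep.py`. The sketch below is only an assumption of what such a helper might look like, returning a `list[dict]` of serialized elements from either a JSON array or an NDJSON file; the actual implementation in the package may differ.

```python
import json
from pathlib import Path


def get_json_data(path: Path) -> list[dict]:
    """Hypothetical sketch: read serialized elements from a JSON array or NDJSON file."""
    if path.suffix == ".ndjson":
        with path.open() as f:
            # One JSON object per line; skip blank lines.
            return [json.loads(line) for line in f if line.strip()]
    with path.open() as f:
        data = json.load(f)
    return data if isinstance(data, list) else [data]
```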
unstructured_ingest/logger.py
CHANGED
@@ -1,99 +1,8 @@
-import ast
-import json
 import logging
-import typing as t
 
 logger = logging.getLogger("unstructured_ingest")
 
 
-def default_is_data_sensitive(k: str, v: t.Any) -> bool:
-    sensitive_fields = [
-        "account_name",
-        "client_id",
-    ]
-    sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
-    return (
-        v
-        and any([s in k.lower() for s in sensitive_triggers])  # noqa: C419
-        or k.lower() in sensitive_fields
-    )
-
-
-def hide_sensitive_fields(
-    data: dict, is_sensitive_fn: t.Callable[[str, t.Any], bool] = default_is_data_sensitive
-) -> dict:
-    """
-    Will recursively look through every k, v pair in this dict and any nested ones and run
-    is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if
-    any string value can be parsed as valid json and process that dict as well and replace
-    the original string with the json.dumps() version of the redacted dict.
-    """
-    new_data = data.copy()
-    for k, v in new_data.items():
-        if is_sensitive_fn(k, v):
-            new_data[k] = "*******"
-        if isinstance(v, dict):
-            new_data[k] = hide_sensitive_fields(v)
-        if isinstance(v, str):
-            # Need to take into account strings generated via json.dumps() or simply printing a dict
-            try:
-                json_data = json.loads(v)
-                if isinstance(json_data, dict):
-                    updated_data = hide_sensitive_fields(json_data)
-                    new_data[k] = json.dumps(updated_data)
-            except json.JSONDecodeError:
-                pass
-
-    return new_data
-
-
-def redact_jsons(s: str) -> str:
-    """
-    Takes in a generic string and pulls out all valid json content. Leverages
-    hide_sensitive_fields() to redact any sensitive information and replaces the
-    original json with the new redacted format. There can be any number of valid
-    jsons in a generic string and this will work. Having extra '{' without a
-    closing '}' will cause this to break though. i.e '{ text, {"a": 3}'.
-
-    """
-    chars = list(s)
-    if "{" not in chars:
-        return s
-    i = 0
-    jsons = []
-    i = 0
-    while i < len(chars):
-        char = chars[i]
-        if char == "{":
-            stack = [char]
-            current = [char]
-            while len(stack) != 0 and i < len(chars):
-                i += 1
-                char = chars[i]
-                current.append(char)
-                if char == "{":
-                    stack.append(char)
-                if char == "}":
-                    stack.pop(-1)
-            jsons.append("".join(current))
-            continue
-        i += 1
-    for j in jsons:
-        try:
-            formatted_j = json.dumps(json.loads(j))
-        except json.JSONDecodeError:
-            formatted_j = json.dumps(ast.literal_eval(j))
-        hidden_j = json.dumps(hide_sensitive_fields(json.loads(formatted_j)))
-        s = s.replace(j, hidden_j)
-    return s
-
-
-class SensitiveFormatter(logging.Formatter):
-    def format(self, record):
-        s = super().format(record=record)
-        return redact_jsons(s)
-
-
 def remove_root_handlers(logger: logging.Logger) -> None:
     # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
@@ -106,7 +15,7 @@ def remove_root_handlers(logger: logging.Logger) -> None:
 def ingest_log_streaming_init(level: int) -> None:
     handler = logging.StreamHandler()
     handler.name = "ingest_log_handler"
-    formatter =
+    formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
     handler.setFormatter(formatter)
 
     # Only want to add the handler once
@@ -122,7 +31,7 @@ def make_default_logger(level: int) -> logging.Logger:
     logger = logging.getLogger("unstructured_ingest")
     handler = logging.StreamHandler()
     handler.name = "ingest_log_handler"
-    formatter =
+    formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(level)
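With the redaction helpers and `SensitiveFormatter` removed, both logger setup paths fall back to a plain `logging.Formatter` with the format string shown in the diff. The following is a minimal sketch of the resulting wiring; the real functions also guard against adding duplicate handlers, and the return value here is an assumption.

```python
import logging


def make_default_logger(level: int) -> logging.Logger:
    """Minimal sketch: plain formatter, no JSON-redaction layer."""
    logger = logging.getLogger("unstructured_ingest")
    handler = logging.StreamHandler()
    handler.name = "ingest_log_handler"
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )
    logger.addHandler(handler)
    logger.setLevel(level)
    return logger  # assumed; the packaged function may handle this differently


log = make_default_logger(logging.INFO)
log.info("pipeline started")
```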
unstructured_ingest/main.py
CHANGED
File without changes
unstructured_ingest/pipeline/interfaces.py
CHANGED
@@ -119,7 +119,7 @@ class PipelineStep(ABC):
         iterable = iterable or []
         if iterable:
             logger.info(
-                f"calling {self.__class__.__name__}
+                f"calling {self.__class__.__name__} with {len(iterable)} docs",  # type: ignore
             )
         else:
             logger.info(f"calling {self.__class__.__name__} with no inputs")
unstructured_ingest/pipeline/pipeline.py
CHANGED
@@ -220,7 +220,7 @@ class Pipeline:
 
     def _run(self):
         logger.info(
-            f"running local pipeline: {self} with configs:
+            f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
         )
         if self.context.mp_supported:
             manager = mp.Manager()
unstructured_ingest/processes/chunker.py
CHANGED
@@ -24,6 +24,9 @@ class ChunkerConfig(BaseModel):
         default="https://api.unstructuredapp.io/general/v0/general",
         description="If chunking via api, use the following host.",
     )
+    chunk_api_timeout_ms: Optional[int] = Field(
+        default=None, description="Timeout in milliseconds for all api call during chunking."
+    )
     chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
     chunk_api_key: Optional[SecretStr] = Field(
         default=None, description="API Key for chunking endpoint."
@@ -120,6 +123,7 @@ class Chunker(BaseProcess, ABC):
             api_key=self.config.chunk_api_key.get_secret_value(),
             filename=elements_filepath,
             api_parameters=self.config.to_chunking_kwargs(),
+            timeout_ms=self.config.chunk_api_timeout_ms,
         )
 
         elements = assign_and_map_hash_ids(elements=elements)
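A hedged usage sketch for the new `chunk_api_timeout_ms` field: the field names come straight from the diff above, while the import path and the assumption that the remaining `ChunkerConfig` fields have defaults are untested here.

```python
from unstructured_ingest.processes.chunker import ChunkerConfig

# Chunk via the hosted API and cap every chunking API call at 30 seconds;
# leaving chunk_api_timeout_ms as None keeps the previous behavior (no explicit timeout).
chunker_config = ChunkerConfig(
    chunk_by_api=True,
    chunk_api_key="my-api-key",
    chunk_api_timeout_ms=30_000,
)
```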
unstructured_ingest/processes/connectors/airtable.py
CHANGED
@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from uuid import NAMESPACE_DNS, uuid5
 
-import pandas
 from pydantic import BaseModel, Field, Secret, field_validator
 
 from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
@@ -213,10 +212,13 @@ class AirtableDownloader(Downloader):
         row_dict.update(table_row["fields"])
         return row_dict
 
+    @requires_dependencies(["pandas"], extras="airtable")
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        import pandas as pd
+
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
-        df =
+        df = pd.DataFrame.from_dict(
             data=[self._table_row_to_dict(table_row=row) for row in table_contents]
         ).sort_index(axis=1)
         download_path = self.get_download_path(file_data=file_data)
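The Airtable change above is one instance of a pattern applied across several connectors in this release (Delta Table, DuckDB, MotherDuck, KDB.AI, IBM watsonx.data): the module-level `import pandas` is dropped and the import happens inside the decorated method. The decorator below is a hypothetical stand-in for `requires_dependencies`, not the implementation in `utils/dep_check.py`; it only illustrates how a missing optional dependency can be turned into an actionable error while keeping the base install pandas-free.

```python
import functools
import importlib.util
from typing import Callable


def requires_dependencies(deps: list[str], extras: str) -> Callable:
    """Hypothetical stand-in: fail with an install hint if a dependency is absent."""

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            missing = [d for d in deps if importlib.util.find_spec(d) is None]
            if missing:
                raise ImportError(
                    f"Missing dependencies {missing}; install with "
                    f'pip install "unstructured-ingest[{extras}]"'
                )
            return func(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["pandas"], extras="airtable")
def rows_to_csv(rows: list[dict], path: str) -> None:
    import pandas as pd  # imported lazily, only when the function actually runs

    pd.DataFrame(rows).sort_index(axis=1).to_csv(path, index=False)
```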
unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json
@@ -0,0 +1,23 @@
+{
+  "properties": [
+    {
+      "dataType": [
+        "text"
+      ],
+      "indexFilterable": true,
+      "indexSearchable": true,
+      "name": "record_id",
+      "tokenization": "word"
+    },
+    {
+      "dataType": [
+        "text"
+      ],
+      "indexFilterable": true,
+      "indexSearchable": true,
+      "name": "text",
+      "tokenization": "word"
+    }
+  ],
+  "vectorizer": "none"
+}
unstructured_ingest/processes/connectors/astradb.py
CHANGED
@@ -43,7 +43,7 @@ from unstructured_ingest.processes.connector_registry import (
 )
 from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
 from unstructured_ingest.utils.constants import RECORD_ID_LABEL
-from unstructured_ingest.utils.data_prep import batch_generator,
+from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
 
@@ -465,7 +465,7 @@ class AstraDBUploader(Uploader):
             collection.insert_many(chunk)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
 
 
unstructured_ingest/processes/connectors/databricks/volumes_aws.py
CHANGED
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_aws"
 class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint",
     )
     profile: Optional[str] = None
     token: Optional[str] = Field(
unstructured_ingest/processes/connectors/databricks/volumes_azure.py
CHANGED
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_azure"
 class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
     )
     profile: Optional[str] = None
     azure_workspace_resource_id: Optional[str] = Field(
@@ -47,7 +47,7 @@ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     )
     azure_environment: Optional[str] = Field(
         default=None,
-        description="The Azure environment type for a
+        description="The Azure environment type for a specific set of API endpoints",
         examples=["Public", "UsGov", "China", "Germany"],
     )
 
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py
CHANGED
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_gcp"
 class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
     )
     profile: Optional[str] = None
     google_credentials: Optional[str] = None
unstructured_ingest/processes/connectors/databricks/volumes_table.py
CHANGED
@@ -166,8 +166,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
             cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
             logger.debug(
-                f"migrating content from {catalog_path} to "
-                f"table {self.upload_config.table_name}"
+                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
             )
             data = get_json_data(path=path)
             columns = data[0].keys()
unstructured_ingest/processes/connectors/delta_table.py
CHANGED
@@ -181,6 +181,7 @@ class DeltaTableUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)
 
+    @requires_dependencies(["pandas"], extras="delta-table")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df, file_data=file_data)
unstructured_ingest/processes/connectors/duckdb/base.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Any
 
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import UploadStager
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 _COLUMNS = (
@@ -81,7 +81,7 @@ class BaseDuckDBUploadStager(UploadStager):
     ) -> Path:
         import pandas as pd
 
-        elements_contents =
+        elements_contents = get_json_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
unstructured_ingest/processes/connectors/duckdb/duckdb.py
CHANGED
@@ -67,9 +67,8 @@ class DuckDBConnectionConfig(ConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
-        with self.get_client() as client:
-
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor
 
 
 class DuckDBUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class DuckDBUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df)
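The DuckDB and MotherDuck `get_cursor` methods now open the client and its cursor in a single `with` statement inside the `@contextmanager` generator (the same change appears in the MotherDuck connector below). The standalone sketch here reproduces the shape of that pattern directly against the `duckdb` package, assuming, as recent duckdb releases do, that both the connection and the object returned by `cursor()` support the context-manager protocol; it is not the connector's actual config class.

```python
from contextlib import contextmanager
from typing import Generator

import duckdb


@contextmanager
def get_cursor(database: str = ":memory:") -> Generator[duckdb.DuckDBPyConnection, None, None]:
    # One with-statement chains both context managers: the cursor closes first, then the client.
    with duckdb.connect(database) as client, client.cursor() as cursor:
        yield cursor


with get_cursor() as cur:
    cur.execute("SELECT 42")
    print(cur.fetchone())
```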
unstructured_ingest/processes/connectors/duckdb/motherduck.py
CHANGED
@@ -66,9 +66,8 @@ class MotherDuckConnectionConfig(ConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
-        with self.get_client() as client:
-
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor
 
 
 class MotherDuckUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class MotherDuckUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df)
unstructured_ingest/processes/connectors/fsspec/s3.py
CHANGED
@@ -134,9 +134,11 @@ class S3Indexer(FsspecIndexer):
 
         version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
         metadata: dict[str, str] = {}
-        with
-
-
+        with (
+            contextlib.suppress(AttributeError),
+            self.connection_config.get_client(protocol=self.index_config.protocol) as client,
+        ):
+            metadata = client.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
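The S3 indexer now combines `contextlib.suppress(AttributeError)` with the client context manager in one parenthesized `with` statement (a Python 3.10+ form), so a client that cannot provide metadata simply leaves the dict empty. A self-contained sketch with a dummy client standing in for the fsspec one:

```python
import contextlib


class DummyClient:
    """Stand-in for the fsspec client; it has no metadata() method, so AttributeError is raised."""

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False


metadata: dict[str, str] = {}
with (
    contextlib.suppress(AttributeError),
    DummyClient() as client,
):
    metadata = client.metadata(path="s3://bucket/key")  # AttributeError is swallowed

print(metadata)  # -> {} when the client cannot provide metadata
```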
unstructured_ingest/processes/connectors/gitlab.py
CHANGED
@@ -230,8 +230,7 @@ class GitLabDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 
unstructured_ingest/processes/connectors/google_drive.py
CHANGED
@@ -334,7 +334,6 @@ class GoogleDriveIndexer(Indexer):
         recursive: bool = False,
         previous_path: Optional[str] = None,
     ) -> list[dict]:
-
         fields_input = "nextPageToken, files({})".format(",".join(self.fields))
         q = f"'{object_id}' in parents"
         # Filter by extension but still include any directories
@@ -394,7 +393,6 @@ class GoogleDriveIndexer(Indexer):
         if not self.is_dir(root_info):
             data = [self.map_file_data(root_info)]
         else:
-
             file_contents = self.get_paginated_results(
                 files_client=files_client,
                 object_id=object_id,
unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py
CHANGED
@@ -5,7 +5,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.data_types.file_data import FileData
@@ -29,6 +28,7 @@ from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
+    from pandas import DataFrame
     from pyarrow import Table as ArrowTable
     from pyiceberg.catalog.rest import RestCatalog
     from pyiceberg.table import Table, Transaction
@@ -96,14 +96,12 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
             return UserAuthError(e)
         if 400 <= response_code < 500:
             logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
             )
             return UserError(e)
         if response_code > 500:
             logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
             )
             return ProviderError(e)
         logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
@@ -217,7 +215,7 @@ class IbmWatsonxUploader(SQLUploader):
         return self.upload_config.record_id_key in self.get_table_columns()
 
     @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
-    def _df_to_arrow_table(self, df:
+    def _df_to_arrow_table(self, df: "DataFrame") -> "ArrowTable":
         import pyarrow as pa
 
         # Iceberg will automatically fill missing columns with nulls
@@ -277,16 +275,20 @@
         except Exception as e:
             raise ProviderError(f"Failed to upload data to table: {e}")
 
-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         data_table = self._df_to_arrow_table(df)
 
         with self.get_table() as table:
             self.upload_data_table(table, data_table, file_data)
 
+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data)
         self.upload_dataframe(df=df, file_data=file_data)
 
+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path=path)
         self.upload_dataframe(df=df, file_data=file_data)
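The IBM watsonx.data connector keeps pandas out of module import time by importing `DataFrame` only under `TYPE_CHECKING`, quoting it in annotations, and importing `pandas` lazily inside the decorated methods. A minimal, self-contained sketch of that pattern (function names here are illustrative, not the connector's):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers; pandas is not required at import time.
    from pandas import DataFrame


def summarize(df: "DataFrame") -> int:
    # The quoted annotation keeps the module importable without pandas installed.
    return len(df.index)


def load_and_summarize(records: list[dict]) -> int:
    import pandas as pd  # lazy import, only paid when the function actually runs

    return summarize(pd.DataFrame(records))
```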
unstructured_ingest/processes/connectors/kdbai.py
CHANGED
@@ -141,6 +141,7 @@ class KdbaiUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="kdbai")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         data = get_data_df(path=path)
         self.process_dataframe(df=data)
unstructured_ingest/processes/connectors/outlook.py
CHANGED
@@ -199,8 +199,7 @@ class OutlookDownloader(Downloader):
         download_path = self.get_download_path(file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 