unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +102 -91
- test/integration/connectors/sql/test_singlestore.py +111 -99
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +86 -75
- test/integration/connectors/test_astradb.py +22 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +4 -4
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +3 -3
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
- unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +18 -14
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -1,7 +1,5 @@
-import json
 from dataclasses import dataclass, field
 from datetime import date, datetime
-from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Optional

 from dateutil import parser

@@ -42,7 +40,6 @@ class ChromaAccessConfig(AccessConfig):


 class ChromaConnectionConfig(ConnectionConfig):
-    collection_name: str = Field(description="The name of the Chroma collection to write into.")
     access_config: Secret[ChromaAccessConfig] = Field(
         default=ChromaAccessConfig(), validate_default=True
     )

@@ -62,6 +59,32 @@ class ChromaConnectionConfig(ConnectionConfig):
     )
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

+    @requires_dependencies(["chromadb"], extras="chroma")
+    def get_client(self) -> "Client":
+        import chromadb
+
+        access_config = self.access_config.get_secret_value()
+        if path := self.path:
+            return chromadb.PersistentClient(
+                path=path,
+                settings=access_config.settings,
+                tenant=self.tenant,
+                database=self.database,
+            )
+
+        elif (host := self.host) and (port := self.port):
+            return chromadb.HttpClient(
+                host=host,
+                port=str(port),
+                ssl=self.ssl,
+                headers=access_config.headers,
+                settings=access_config.settings,
+                tenant=self.tenant,
+                database=self.database,
+            )
+        else:
+            raise ValueError("Chroma connector requires either path or host and port to be set.")
+

 class ChromaUploadStagerConfig(UploadStagerConfig):
     pass

@@ -82,11 +105,11 @@ class ChromaUploadStager(UploadStager):
             logger.debug(f"date {date_string} string not a timestamp: {e}")
         return parser.parse(date_string)

-    @staticmethod
-    def conform_dict(data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """
         Prepares dictionary in the format that Chroma requires
         """
+        data = element_dict.copy()
         return {
             "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "embedding": data.pop("embeddings", None),

@@ -94,26 +117,9 @@ class ChromaUploadStager(UploadStager):
             "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True),
         }

-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
-        return output_path
-

 class ChromaUploaderConfig(UploaderConfig):
+    collection_name: str = Field(description="The name of the Chroma collection to write into.")
     batch_size: int = Field(default=100, description="Number of records per batch")


@@ -125,37 +131,11 @@ class ChromaUploader(Uploader):

     def precheck(self) -> None:
         try:
-            self.create_client()
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    @requires_dependencies(["chromadb"], extras="chroma")
-    def create_client(self) -> "Client":
-        import chromadb
-
-        access_config = self.connection_config.access_config.get_secret_value()
-        if self.connection_config.path:
-            return chromadb.PersistentClient(
-                path=self.connection_config.path,
-                settings=access_config.settings,
-                tenant=self.connection_config.tenant,
-                database=self.connection_config.database,
-            )
-
-        elif self.connection_config.host and self.connection_config.port:
-            return chromadb.HttpClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                ssl=self.connection_config.ssl,
-                headers=access_config.headers,
-                settings=access_config.settings,
-                tenant=self.connection_config.tenant,
-                database=self.connection_config.database,
-            )
-        else:
-            raise ValueError("Chroma connector requires either path or host and port to be set.")
-
     @DestinationConnectionError.wrap
     def upsert_batch(self, collection, batch):


@@ -189,19 +169,16 @@ class ChromaUploader(Uploader):
         )
         return chroma_dict

-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
-
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(elements_dict)} objects to destination "
-            f"collection {self.connection_config.collection_name} "
+            f"writing {len(data)} objects to destination "
+            f"collection {self.upload_config.collection_name} "
             f"at {self.connection_config.host}",
         )
-        client = self.create_client()
+        client = self.connection_config.get_client()

-        collection = client.get_or_create_collection(name=self.connection_config.collection_name)
-        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+        collection = client.get_or_create_collection(name=self.upload_config.collection_name)
+        for chunk in batch_generator(data, self.upload_config.batch_size):
             self.upsert_batch(collection, self.prepare_chroma_list(chunk))
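The Chroma changes move client construction onto ChromaConnectionConfig.get_client(), relocate collection_name from the connection config to ChromaUploaderConfig, switch the stager to a per-element conform_dict(), and replace the uploader's file-based run() with run_data(), which receives the already-staged element dicts. The following is a minimal sketch of how the new surface fits together, not documented API: it assumes the uploader dataclass is built directly from its connection_config and upload_config and that the remaining connection fields (host, port, tenant, database, ssl) carry usable defaults; all values are placeholders.

from unstructured_ingest.v2.processes.connectors.chroma import (
    ChromaConnectionConfig,
    ChromaUploader,
    ChromaUploaderConfig,
)

# Client construction now lives on the connection config: PersistentClient when
# `path` is set, HttpClient when `host` and `port` are set.
connection_config = ChromaConnectionConfig(path="/tmp/chroma-demo")  # placeholder path

# `collection_name` has moved from the connection config to the uploader config.
uploader = ChromaUploader(
    connection_config=connection_config,
    upload_config=ChromaUploaderConfig(collection_name="elements", batch_size=100),
)
uploader.precheck()  # now delegates to connection_config.get_client()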
unstructured_ingest/v2/processes/connectors/couchbase.py

@@ -1,7 +1,7 @@
 import hashlib
-import json
 import sys
 import time
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from datetime import timedelta
 from pathlib import Path

@@ -65,7 +65,8 @@ class CouchbaseConnectionConfig(ConnectionConfig):
     access_config: Secret[CouchbaseAccessConfig]

     @requires_dependencies(["couchbase"], extras="couchbase")
-    def …
+    @contextmanager
+    def get_client(self) -> Generator["Cluster", None, None]:
         from couchbase.auth import PasswordAuthenticator
         from couchbase.cluster import Cluster
         from couchbase.options import ClusterOptions

@@ -73,9 +74,14 @@ class CouchbaseConnectionConfig(ConnectionConfig):
         auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
         options = ClusterOptions(auth)
         options.apply_profile("wan_development")
-        cluster = Cluster(self.connection_string, options)
-        cluster.wait_until_ready(timedelta(seconds=5))
-        return cluster
+        cluster = None
+        try:
+            cluster = Cluster(self.connection_string, options)
+            cluster.wait_until_ready(timedelta(seconds=5))
+            yield cluster
+        finally:
+            if cluster:
+                cluster.close()


 class CouchbaseUploadStagerConfig(UploadStagerConfig):

@@ -88,32 +94,16 @@ class CouchbaseUploadStager(UploadStager):
         default_factory=lambda: CouchbaseUploadStagerConfig()
     )

-    def …
- …
- …
- …
- …
- …
- …
- …
-            elements_contents = json.load(elements_file)
- …
-        output_elements = []
-        for element in elements_contents:
-            new_doc = {
-                element["element_id"]: {
-                    "embedding": element.get("embeddings", None),
-                    "text": element.get("text", None),
-                    "metadata": element.get("metadata", None),
-                    "type": element.get("type", None),
-                }
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            data["element_id"]: {
+                "embedding": data.get("embeddings", None),
+                "text": data.get("text", None),
+                "metadata": data.get("metadata", None),
+                "type": data.get("type", None),
             }
- …
- …
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(output_elements, output_file)
-        return output_path
+        }


 class CouchbaseUploaderConfig(UploaderConfig):

@@ -128,26 +118,26 @@ class CouchbaseUploader(Uploader):

     def precheck(self) -> None:
         try:
-            self.connection_config.…
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(elements_dict)} objects to destination "
+            f"writing {len(data)} objects to destination "
             f"bucket, {self.connection_config.bucket} "
             f"at {self.connection_config.connection_string}",
         )
- …
- …
- …
- …
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(self.connection_config.bucket)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)

- …
- …
+            for chunk in batch_generator(data, self.upload_config.batch_size):
+                collection.upsert_multi(
+                    {doc_id: doc for doc in chunk for doc_id, doc in doc.items()}
+                )


 class CouchbaseIndexerConfig(IndexerConfig):

@@ -162,7 +152,7 @@ class CouchbaseIndexer(Indexer):

     def precheck(self) -> None:
         try:
-            self.connection_config.…
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

@@ -180,10 +170,10 @@ class CouchbaseIndexer(Indexer):
         attempts = 0
         while attempts < max_attempts:
             try:
- …
- …
- …
- …
+                with self.connection_config.get_client() as client:
+                    result = client.query(query)
+                    document_ids = [row["id"] for row in result]
+                    return document_ids
             except Exception as e:
                 attempts += 1
                 time.sleep(3)

@@ -294,13 +284,13 @@ class CouchbaseDownloader(Downloader):
         bucket_name: str = file_data.additional_metadata["bucket"]
         ids: list[str] = file_data.additional_metadata["ids"]

- …
- …
- …
- …
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(bucket_name)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)

- …
- …
+            download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+            return list(download_resp)

     def process_doc_id(self, doc_id, collection, bucket_name, file_data):
         result = collection.get(doc_id)
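For Couchbase, the cluster handle is now produced by a @contextmanager-wrapped get_client() on the connection config, so the indexer, downloader and uploader all share one code path and the cluster is closed in the generator's finally clause; the uploader consumes staged dicts through run_data() and writes them with upsert_multi. A minimal sketch of the context-manager pattern follows; the field names come from the diff above, but the constructor call and all values are placeholders and assumptions, not the library's documented API.

from unstructured_ingest.v2.processes.connectors.couchbase import (
    CouchbaseAccessConfig,
    CouchbaseConnectionConfig,
)

# Placeholder connection values; field names follow the diff above.
connection_config = CouchbaseConnectionConfig(
    connection_string="couchbase://localhost",
    username="Administrator",
    access_config=CouchbaseAccessConfig(password="password"),
    bucket="ingest",
    scope="_default",
    collection="_default",
)

# New in 0.3.9: the cluster handle is a context manager, closed on exit
# via cluster.close() in the generator's finally clause.
with connection_config.get_client() as cluster:
    bucket = cluster.bucket(connection_config.bucket)
    collection = bucket.scope(connection_config.scope).collection(connection_config.collection)
    collection.upsert_multi({"example-id": {"text": "hello"}})  # placeholder document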
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -11,6 +11,7 @@ import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
 from unstructured_ingest.v2.interfaces import (

@@ -28,6 +29,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 CONNECTOR_TYPE = "delta_table"


+@requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):
     from deltalake.writer import write_deltalake

@@ -136,39 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def process_csv(self, csv_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(csv_paths)} csv files")
-        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
-        return df
-
-    def process_json(self, json_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(json_paths)} json files")
-        all_records = []
-        for p in json_paths:
-            with open(p) as json_file:
-                all_records.extend(json.load(json_file))
-
-        return pd.DataFrame(data=all_records)
-
-    def process_parquet(self, parquet_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(parquet_paths)} parquet files")
-        df = pd.concat((pd.read_parquet(path) for path in parquet_paths), ignore_index=True)
-        return df
-
-    def read_dataframe(self, path: Path) -> pd.DataFrame:
-        if path.suffix == ".csv":
-            return self.process_csv(csv_paths=[path])
-        elif path.suffix == ".json":
-            return self.process_json(json_paths=[path])
-        elif path.suffix == ".parquet":
-            return self.process_parquet(parquet_paths=[path])
-        else:
-            raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
-
-    @requires_dependencies(["deltalake"], extras="delta-table")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
-        df = self.read_dataframe(path)
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )

@@ -203,6 +173,14 @@ class DeltaTableUploader(Uploader):
             logger.error(f"Exception occurred in write_deltalake: {error_message}")
             raise RuntimeError(f"Error in write_deltalake: {error_message}")

+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path)
+        self.upload_dataframe(df=df, file_data=file_data)
+

 delta_table_destination_entry = DestinationRegistryEntry(
     connection_config=DeltaTableConnectionConfig,
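The delta_table uploader drops its private CSV/JSON/Parquet readers in favor of the shared get_data_df() helper added in unstructured_ingest/utils/data_prep.py, and gains a run_data() path that takes staged element dicts directly; both entry points feed upload_dataframe(). The helper's implementation is not shown in this diff, so the following is a hypothetical stand-in that simply mirrors the removed per-format readers; the name get_data_df_sketch and its behavior are assumptions, not the library's actual code.

import json
from pathlib import Path

import pandas as pd


def get_data_df_sketch(path: Path) -> pd.DataFrame:
    # Hypothetical stand-in for unstructured_ingest.utils.data_prep.get_data_df:
    # one shared place that turns a staged file into a DataFrame, replacing the
    # removed process_csv/process_json/process_parquet methods above.
    if path.suffix == ".csv":
        return pd.read_csv(path)
    if path.suffix == ".json":
        with open(path) as json_file:
            return pd.DataFrame(data=json.load(json_file))
    if path.suffix == ".parquet":
        return pd.read_parquet(path)
    raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")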
unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -1,5 +1,3 @@
-import json
-import uuid
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any

@@ -7,6 +5,7 @@ from typing import Any
 import pandas as pd

 from unstructured_ingest.v2.interfaces import FileData, UploadStager
+from unstructured_ingest.v2.utils import get_enhanced_element_id

 _COLUMNS = (
     "id",

@@ -56,6 +55,22 @@ _COLUMNS = (
 @dataclass
 class BaseDuckDBUploadStager(UploadStager):

+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        metadata: dict[str, Any] = data.pop("metadata", {})
+        data_source = metadata.pop("data_source", {})
+        coordinates = metadata.pop("coordinates", {})
+
+        data.update(metadata)
+        data.update(data_source)
+        data.update(coordinates)
+
+        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
+
+        # remove extraneous, not supported columns
+        data = {k: v for k, v in data.items() if k in _COLUMNS}
+        return data
+
     def run(
         self,
         elements_filepath: Path,

@@ -64,29 +79,14 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        output = []
-        for data in elements_contents:
-            metadata: dict[str, Any] = data.pop("metadata", {})
-            data_source = metadata.pop("data_source", {})
-            coordinates = metadata.pop("coordinates", {})
-
-            data.update(metadata)
-            data.update(data_source)
-            data.update(coordinates)
-
-            data["id"] = str(uuid.uuid4())
-
-            # remove extraneous, not supported columns
-            data = {k: v for k, v in data.items() if k in _COLUMNS}
-
-            output.append(data)
+        elements_contents = self.get_data(elements_filepath=elements_filepath)
+        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

- …
+        output = [
+            self.conform_dict(element_dict=element_dict, file_data=file_data)
+            for element_dict in elements_contents
+        ]
+        df = pd.DataFrame(data=output)

         for column in filter(
             lambda x: x in df.columns,

@@ -94,6 +94,6 @@ class BaseDuckDBUploadStager(UploadStager):
         ):
             df[column] = df[column].apply(str)

- …
- …
+        data = df.to_dict(orient="records")
+        self.write_output(output_path=output_path, data=data)
         return output_path
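The shared DuckDB stager now conforms elements one at a time through conform_dict(), flattening metadata, data_source and coordinates onto the top-level record, and it replaces the random uuid.uuid4() id with the deterministic get_enhanced_element_id(), so re-staging the same file yields stable row ids. A small sketch of the flattening step on a single element is below; the element is a trimmed placeholder, not real pipeline output, and the key names are illustrative.

# Placeholder element; real elements carry many more metadata fields.
element = {
    "element_id": "abc123",
    "type": "NarrativeText",
    "text": "Hello world",
    "metadata": {
        "filename": "example.pdf",
        "data_source": {"url": "s3://bucket/example.pdf"},
        "coordinates": {"system": "PixelSpace"},
    },
}

data = element.copy()
metadata = data.pop("metadata", {})
data_source = metadata.pop("data_source", {})
coordinates = metadata.pop("coordinates", {})

# The nested dicts are flattened onto the top-level record; conform_dict then
# assigns a deterministic id and drops any key not listed in _COLUMNS.
data.update(metadata)
data.update(data_source)
data.update(coordinates)
print(sorted(data))  # ['element_id', 'filename', 'system', 'text', 'type', 'url']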
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py

@@ -1,11 +1,13 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional

 import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -55,6 +57,20 @@ class DuckDBConnectionConfig(ConnectionConfig):
         "through the `database` argument"
     )

+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["DuckDBConnection", None, None]:
+        import duckdb
+
+        with duckdb.connect(self.database) as client:
+            yield client
+
+    @contextmanager
+    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor
+

 class DuckDBUploadStagerConfig(UploadStagerConfig):
     pass

@@ -79,34 +95,27 @@ class DuckDBUploader(Uploader):

     def precheck(self) -> None:
         try:
- …
- …
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

- …
- …
-        return self._make_duckdb_connection
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_duckdb_connection(self) -> "DuckDBConnection":
-        import duckdb
-
-        return duckdb.connect(self.connection_config.database)
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )

+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- …
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)


 duckdb_destination_entry = DestinationRegistryEntry(
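Connection handling for local DuckDB moves onto DuckDBConnectionConfig as paired get_client()/get_cursor() context managers, so the uploader's precheck and the INSERT path share one code path and connections are released deterministically; the uploader also gains run_data() and reuses get_data_df() for the file-based run(). Below is a minimal sketch of the same nested-contextmanager pattern written outside the connector classes; the database path is a placeholder and the duckdb package must be installed.

from contextlib import contextmanager
from typing import Generator

import duckdb


@contextmanager
def get_client(database: str) -> Generator["duckdb.DuckDBPyConnection", None, None]:
    # duckdb connections support the with-statement and are closed on exit
    with duckdb.connect(database) as client:
        yield client


@contextmanager
def get_cursor(database: str) -> Generator["duckdb.DuckDBPyConnection", None, None]:
    # nesting keeps the cursor's lifetime inside the connection's lifetime
    with get_client(database) as client:
        with client.cursor() as cursor:
            yield cursor


# The uploader's precheck now boils down to:
with get_cursor("demo.duckdb") as cursor:
    cursor.execute("SELECT 1;")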
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py

@@ -1,12 +1,14 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional

 import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -27,13 +29,12 @@ CONNECTOR_TYPE = "motherduck"


 class MotherDuckAccessConfig(AccessConfig):
-    md_token: Optional[str] = Field(default=None, description="MotherDuck token")
+    md_token: str = Field(default=None, description="MotherDuck token")


 class MotherDuckConnectionConfig(ConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
-    database: Optional[str] = Field(
-        default=None,
+    database: str = Field(
         description="Database name. Name of the MotherDuck database.",
     )
     db_schema: Optional[str] = Field(

@@ -48,17 +49,26 @@ class MotherDuckConnectionConfig(ConnectionConfig):
         default=MotherDuckAccessConfig(), validate_default=True
     )

- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["MotherDuckConnection", None, None]:
+        import duckdb
+
+        access_config = self.access_config.get_secret_value()
+        with duckdb.connect(
+            f"md:?motherduck_token={access_config.md_token}",
+            config={
+                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
+            },
+        ) as conn:
+            conn.sql(f"USE {self.database}")
+            yield conn
+
+    @contextmanager
+    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor


 class MotherDuckUploadStagerConfig(UploadStagerConfig):

@@ -84,44 +94,27 @@ class MotherDuckUploader(Uploader):

     def precheck(self) -> None:
         try:
- …
- …
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

- …
- …
-        return self._make_motherduck_connection
-
-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_motherduck_connection(self) -> "MotherDuckConnection":
-        import duckdb
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

-        access_config = self.connection_config.access_config.get_secret_value()
-        conn = duckdb.connect(
-            f"md:?motherduck_token={access_config.md_token}",
-            config={
-                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
-            },
-        )
-
-        conn.sql(f"USE {self.connection_config.database}")
-
-        return conn
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )

+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- …
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)


 motherduck_destination_entry = DestinationRegistryEntry(
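The MotherDuck connector follows the same get_client()/get_cursor() pattern as DuckDB, but it builds the connection string from the md_token, tags requests with a custom user agent derived from the package version, and issues USE on the configured database; the database field no longer defaults to None and md_token is now annotated as str. The sketch below shows only the connection it opens, with placeholder values in place of real credentials (a hardcoded user-agent string stands in for the version variable used in the code).

import duckdb

md_token = "YOUR_MOTHERDUCK_TOKEN"  # placeholder; comes from MotherDuckAccessConfig
database = "my_database"            # placeholder; the config's `database` field

with duckdb.connect(
    f"md:?motherduck_token={md_token}",
    config={"custom_user_agent": "unstructured-io-ingest/0.3.9"},
) as conn:
    conn.sql(f"USE {database}")
    conn.sql("SELECT 1;")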