unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
--- a/unstructured_ingest/v2/processes/connectors/gitlab.py
+++ b/unstructured_ingest/v2/processes/connectors/gitlab.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
@@ -82,16 +83,18 @@ class GitLabConnectionConfig(ConnectionConfig):
 
     @SourceConnectionError.wrap
     @requires_dependencies(["gitlab"], extras="gitlab")
-
+    @contextmanager
+    def get_client(self) -> Generator["Gitlab", None, None]:
         from gitlab import Gitlab
 
         logger.info(f"Connection to GitLab: {self.base_url!r}")
-
+        with Gitlab(
             self.base_url, private_token=self.access_config.get_secret_value().access_token
-        )
-
+        ) as client:
+            yield client
 
-
+    @contextmanager
+    def get_project(self) -> Generator["Project", None, None]:
         """Retrieves the specified GitLab project using the configured base URL and access token.
 
         Returns:
@@ -101,13 +104,12 @@ class GitLabConnectionConfig(ConnectionConfig):
             SourceConnectionError: If the GitLab API connection fails.
             gitlab.exceptions.GitlabGetError: If the project is not found.
         """
-
+        with self.get_client() as client:
+            logger.info(f"Accessing Project: '{self.repo_path}'")
+            project = client.projects.get(self.repo_path)
 
-
-
-
-        logger.info(f"Successfully accessed project '{self.repo_path}'")
-        return project
+            logger.info(f"Successfully accessed project '{self.repo_path}'")
+            yield project
 
 
 class GitLabIndexerConfig(IndexerConfig):
@@ -144,11 +146,11 @@ class GitLabIndexer(Indexer):
         """
 
         try:
-
-
-
-
-
+            with self.connection_config.get_client() as client:
+                if self.connection_config.access_config.get_secret_value().access_token is not None:
+                    client.auth()
+                else:
+                    client.projects.get(self.connection_config.repo_path)
 
         except Exception as e:
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
@@ -168,17 +170,16 @@ class GitLabIndexer(Indexer):
             FileData: A generator that yields `FileData` objects representing each file (blob)
                 in the repository.
         """
-
-
-
-
-
-
-
-
-
-
-        )
+        with self.connection_config.get_project() as project:
+            ref = self.index_config.git_branch or project.default_branch
+
+            files = project.repository_tree(
+                path=str(self.index_config.path),
+                ref=ref,
+                recursive=self.index_config.recursive,
+                iterator=True,
+                all=True,
+            )
 
         for file in files:
             relative_path = str(Path(file["path"]).relative_to(self.index_config.path))
@@ -250,12 +251,12 @@ class GitLabDownloader(Downloader):
 
         ref = file_data.metadata.record_locator["ref"]
         path = file_data.metadata.record_locator["file_path"]
-
-        project_file = self.connection_config.get_project().files.get(file_path=path, ref=ref)
         download_path.parent.mkdir(exist_ok=True, parents=True)
 
-        with
-
+        with self.connection_config.get_project() as project:
+            project_file = project.files.get(file_path=path, ref=ref)
+            with open(download_path, "wb") as file:
+                file.write(project_file.decode())
 
 
 gitlab_source_entry = SourceRegistryEntry(
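The change repeated across the connectors in this release is that connection objects are now exposed as context managers (`get_client`, `get_project`, and similar) instead of plain factory methods, so the underlying SDK connection is always released. A minimal, self-contained sketch of that pattern follows; `FakeClient` and its methods are hypothetical stand-ins, not the library's API.

```python
from contextlib import contextmanager
from typing import Generator


class FakeClient:
    """Hypothetical stand-in for a third-party SDK client (e.g. python-gitlab's Gitlab)."""

    def close(self) -> None:
        print("connection closed")

    def get_project(self, path: str) -> str:
        return f"project:{path}"


@contextmanager
def get_client() -> Generator[FakeClient, None, None]:
    # Create the connection lazily and always release it, even if the body raises.
    client = FakeClient()
    try:
        yield client
    finally:
        client.close()


# Callers scope the connection lifetime to the with-block.
with get_client() as client:
    print(client.get_project("group/repo"))
```

Scoping the client to a `with` block is what lets the precheck, indexing, and download paths above share one acquisition/cleanup code path.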
--- a/unstructured_ingest/v2/processes/connectors/google_drive.py
+++ b/unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -1,5 +1,6 @@
 import io
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
@@ -74,7 +75,8 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
     access_config: Secret[GoogleDriveAccessConfig]
 
     @requires_dependencies(["googleapiclient"], extras="google-drive")
-
+    @contextmanager
+    def get_client(self) -> Generator["GoogleAPIResource", None, None]:
         from google.auth import exceptions
         from google.oauth2 import service_account
         from googleapiclient.discovery import build
@@ -86,8 +88,8 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
         try:
             creds = service_account.Credentials.from_service_account_info(key_data)
             service = build("drive", "v3", credentials=creds)
-
-
+            with service.files() as client:
+                yield client
         except HttpError as exc:
             raise ValueError(f"{exc.reason}")
         except exceptions.DefaultCredentialsError:
@@ -132,7 +134,7 @@ class GoogleDriveIndexer(Indexer):
 
     def precheck(self) -> None:
         try:
-            self.connection_config.
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -266,13 +268,14 @@ class GoogleDriveIndexer(Indexer):
         return data
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-
-
-
-
-
-
-
+        with self.connection_config.get_client() as client:
+            for f in self.get_files(
+                files_client=client,
+                object_id=self.connection_config.drive_id,
+                recursive=self.index_config.recursive,
+                extensions=self.index_config.extensions,
+            ):
+                yield f
 
 
 class GoogleDriveDownloaderConfig(DownloaderConfig):
@@ -309,30 +312,30 @@ class GoogleDriveDownloader(Downloader):
         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
         mime_type = file_data.additional_metadata["mimeType"]
         record_id = file_data.identifier
-
-
-
-
-        )
-        if not export_mime:
-            raise TypeError(
-                f"File not supported. Name: {file_data.source_identifiers.filename} "
-                f"ID: {record_id} "
-                f"MimeType: {mime_type}"
+        with self.connection_config.get_client() as client:
+            if mime_type.startswith("application/vnd.google-apps"):
+                export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
+                    self.meta.get("mimeType"), # type: ignore
                 )
-
-
-
-
-
-
-
+                if not export_mime:
+                    raise TypeError(
+                        f"File not supported. Name: {file_data.source_identifiers.filename} "
+                        f"ID: {record_id} "
+                        f"MimeType: {mime_type}"
+                    )
+
+                request = client.export_media(
+                    fileId=record_id,
+                    mimeType=export_mime,
+                )
+            else:
+                request = client.get_media(fileId=record_id)
 
         file_contents = io.BytesIO()
         downloader = MediaIoBaseDownload(file_contents, request)
         downloaded = self._get_content(downloader=downloader)
         if not downloaded or not file_contents:
-
+            raise SourceConnectionError("nothing found to download")
         return self._write_file(file_data=file_data, file_contents=file_contents)
 
 
--- a/unstructured_ingest/v2/processes/connectors/kafka/kafka.py
+++ b/unstructured_ingest/v2/processes/connectors/kafka/kafka.py
@@ -257,8 +257,6 @@ class KafkaUploader(Uploader, ABC):
         if failed_producer:
             raise KafkaException("failed to produce all messages in batch")
 
-    def
-
-            elements = json.load(elements_file)
-        for element_batch in batch_generator(elements, batch_size=self.upload_config.batch_size):
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        for element_batch in batch_generator(data, batch_size=self.upload_config.batch_size):
             self.produce_batch(elements=element_batch)
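The Kafka hunk above also shows the other recurring change: uploaders gain a `run_data` entry point that receives the already-parsed element dictionaries rather than a path to a JSON file. The sketch below illustrates the shape of that change with simplified, hypothetical names (`SketchUploader`, `produce_batch`); `batch_generator` is assumed to chunk a list, which matches how it is called in the diff.

```python
import json
from pathlib import Path
from typing import Any, Iterator


def batch_generator(items: list[dict], batch_size: int) -> Iterator[list[dict]]:
    # Assumed behaviour: yield fixed-size chunks of the element list.
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


class SketchUploader:
    batch_size = 100

    def produce_batch(self, elements: list[dict]) -> None:
        print(f"producing {len(elements)} messages")

    # Old shape: the uploader received a file path and loaded the JSON itself.
    def run(self, path: Path, **kwargs: Any) -> None:
        with path.open() as f:
            elements = json.load(f)
        self.run_data(data=elements)

    # New shape: the pipeline passes the parsed element dicts directly.
    def run_data(self, data: list[dict], **kwargs: Any) -> None:
        for element_batch in batch_generator(data, batch_size=self.batch_size):
            self.produce_batch(elements=element_batch)
```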
--- a/unstructured_ingest/v2/processes/connectors/kdbai.py
+++ b/unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -1,14 +1,13 @@
-import
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import numpy as np
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.data_prep import flatten_dict, get_data_df, split_dataframe
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -48,12 +47,19 @@ class KdbaiConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["kdbai_client"], extras="kdbai")
-
+    @contextmanager
+    def get_client(self) -> Generator["Session", None, None]:
         from kdbai_client import Session
 
-
-
-
+        session = None
+        try:
+            session = Session(
+                api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+            )
+            yield session
+        finally:
+            if session:
+                session.close()
 
 
 class KdbaiUploadStagerConfig(UploadStagerConfig):
@@ -64,38 +70,19 @@ class KdbaiUploadStagerConfig(UploadStagerConfig):
 class KdbaiUploadStager(UploadStager):
     upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-        data = []
-        for element in elements_contents:
-            data.append(
-                {
-                    "id": get_enhanced_element_id(element_dict=element, file_data=file_data),
-                    "element_id": element.get("element_id"),
-                    "document": element.pop("text", None),
-                    "embeddings": element.get("embeddings"),
-                    "metadata": flatten_dict(
-                        dictionary=element.get("metadata"),
-                        flatten_lists=True,
-                        remove_none=True,
-                    ),
-                }
-            )
-        logger.debug(f"writing {len(data)} elements to {output_path}")
-        with output_path.open("w") as output_file:
-            json.dump(data, output_file, indent=2)
-        return output_path
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
+            "element_id": data.get("element_id"),
+            "document": data.pop("text", None),
+            "embeddings": data.get("embeddings"),
+            "metadata": flatten_dict(
+                dictionary=data.get("metadata"),
+                flatten_lists=True,
+                remove_none=True,
+            ),
+        }
 
 
 class KdbaiUploaderConfig(UploaderConfig):
@@ -119,50 +106,37 @@ class KdbaiUploader(Uploader):
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-
-
-
-
+    @contextmanager
+    def get_database(self) -> Generator["Database", None, None]:
+        with self.connection_config.get_client() as client:
+            db = client.database(self.upload_config.database_name)
+            yield db
 
-
-
-
-
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.get_database() as db:
+            table = db.table(self.upload_config.table_name)
+            yield table
 
     def upsert_batch(self, batch: pd.DataFrame):
-
-
+        with self.get_table() as table:
+            table.insert(batch)
 
     def process_dataframe(self, df: pd.DataFrame):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
         )
-        for
+        for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)
 
-    def
-
-        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
-        self.process_dataframe(df=df)
-
-    def process_json(self, json_paths: list[Path]):
-        logger.debug(f"uploading content from {len(json_paths)} json files")
-        all_records = []
-        for p in json_paths:
-            with open(p) as json_file:
-                all_records.extend(json.load(json_file))
-
-        df = pd.DataFrame(data=all_records)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
-
-        elif path.suffix == ".json":
-            self.process_json(json_paths=[path])
-        else:
-            raise ValueError(f"Unsupported file type, must be json or csv file: {path}")
+        data = get_data_df(path=path)
+        self.process_dataframe(df=data)
 
 
 kdbai_destination_entry = DestinationRegistryEntry(
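The KDB.AI uploader now leans on `get_data_df` and `split_dataframe` from `unstructured_ingest.utils.data_prep`, whose implementations are not shown in this diff. Assuming `split_dataframe` simply yields fixed-size row slices, the batching it enables looks roughly like this sketch:

```python
from typing import Iterator

import pandas as pd


def split_dataframe(df: pd.DataFrame, chunk_size: int) -> Iterator[pd.DataFrame]:
    # Assumed behaviour: yield successive row slices of at most chunk_size rows.
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start : start + chunk_size]


records = [{"id": i, "document": f"text {i}"} for i in range(250)]
df = pd.DataFrame(records)

for batch in split_dataframe(df, chunk_size=100):
    # In the uploader, each batch would be handed to table.insert(batch).
    print(len(batch))  # 100, 100, 50
```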
--- a/unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
+++ b/unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
@@ -41,14 +41,11 @@ class LanceDBConnectionConfig(ConnectionConfig, ABC):
     async def get_async_connection(self) -> AsyncGenerator["AsyncConnection", None]:
         import lancedb
 
-
+        with await lancedb.connect_async(
             self.uri,
             storage_options=self.get_storage_options(),
-        )
-        try:
+        ) as connection:
             yield connection
-        finally:
-            connection.close()
 
 
 class LanceDBRemoteConnectionConfig(LanceDBConnectionConfig):
@@ -85,8 +82,8 @@ class LanceDBUploadStager(UploadStager):
 
         df = pd.DataFrame(
             [
-                self.
-                for
+                self.conform_dict(element_dict=element_dict, file_data=file_data)
+                for element_dict in elements_contents
             ]
         )
 
@@ -95,11 +92,12 @@ class LanceDBUploadStager(UploadStager):
 
         return output_path
 
-    def
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
         return {
-            "vector":
+            "vector": data.pop("embeddings", None),
             RECORD_ID_LABEL: file_data.identifier,
-            **flatten_dict(
+            **flatten_dict(data, separator="-"),
         }
 
 
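The LanceDB and KDB.AI stagers both move per-element conversion into a `conform_dict(element_dict, file_data)` hook, leaving file reading and writing to the shared stager plumbing. The sketch below mirrors the LanceDB field mapping with a simplified, hypothetical `flatten_dict` and a plain `record_id` argument standing in for `FileData`:

```python
from typing import Any


def flatten_dict(dictionary: dict, separator: str = "-") -> dict:
    # Simplified assumption: flatten nested dicts into "parent-child" keys.
    flat: dict = {}
    for key, value in dictionary.items():
        if isinstance(value, dict):
            for sub_key, sub_value in flatten_dict(value, separator).items():
                flat[f"{key}{separator}{sub_key}"] = sub_value
        else:
            flat[key] = value
    return flat


def conform_dict(element_dict: dict, record_id: str) -> dict[str, Any]:
    # One element in, one flat row out; the stager maps this over every element.
    data = element_dict.copy()
    return {
        "vector": data.pop("embeddings", None),
        "record_id": record_id,
        **flatten_dict(data, separator="-"),
    }


element = {"text": "hello", "embeddings": [0.1, 0.2], "metadata": {"filename": "a.pdf"}}
print(conform_dict(element, record_id="file-123"))
```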
--- a/unstructured_ingest/v2/processes/connectors/local.py
+++ b/unstructured_ingest/v2/processes/connectors/local.py
@@ -1,4 +1,5 @@
 import glob
+import json
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -175,7 +176,7 @@ class LocalUploader(Uploader):
     def is_async(self) -> bool:
         return False
 
-    def
+    def get_destination_path(self, file_data: FileData) -> Path:
         if source_identifiers := file_data.source_identifiers:
             rel_path = (
                 source_identifiers.relative_path[1:]
@@ -188,7 +189,17 @@ class LocalUploader(Uploader):
             )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
-        Path(final_path)
+        final_path = Path(final_path)
+        final_path.parent.mkdir(parents=True, exist_ok=True)
+        return final_path
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        final_path = self.get_destination_path(file_data=file_data)
+        with final_path.open("w") as f:
+            json.dump(data, f)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        final_path = self.get_destination_path(file_data=file_data)
         logger.debug(f"copying file from {path} to {final_path}")
         shutil.copy(src=str(path), dst=str(final_path))
 
--- a/unstructured_ingest/v2/processes/connectors/milvus.py
+++ b/unstructured_ingest/v2/processes/connectors/milvus.py
@@ -1,10 +1,8 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
-import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret
 
@@ -16,7 +14,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -59,10 +56,17 @@ class MilvusConnectionConfig(ConnectionConfig):
         return connection_config_dict
 
     @requires_dependencies(["pymilvus"], extras="milvus")
-
+    @contextmanager
+    def get_client(self) -> Generator["MilvusClient", None, None]:
         from pymilvus import MilvusClient
 
-
+        client = None
+        try:
+            client = MilvusClient(**self.get_connection_kwargs())
+            yield client
+        finally:
+            if client:
+                client.close()
 
 
 class MilvusUploadStagerConfig(UploadStagerConfig):
@@ -91,8 +95,8 @@ class MilvusUploadStager(UploadStager):
                 pass
         return parser.parse(date_string).timestamp()
 
-    def conform_dict(self,
-        working_data =
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        working_data = element_dict.copy()
         if self.upload_stager_config.flatten_metadata and (
             metadata := working_data.pop("metadata", None)
         ):
@@ -134,29 +138,6 @@ class MilvusUploadStager(UploadStager):
         working_data[RECORD_ID_LABEL] = file_data.identifier
         return working_data
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict[str, Any]] = json.load(elements_file)
-        new_content = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_filename_path = Path(output_filename)
-        if output_filename_path.suffix == ".json":
-            output_path = Path(output_dir) / output_filename_path
-        else:
-            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with output_path.open("w") as output_file:
-            json.dump(new_content, output_file, indent=2)
-        return output_path
-
 
 class MilvusUploaderConfig(UploaderConfig):
     db_name: Optional[str] = Field(default=None, description="Milvus database name")
@@ -183,22 +164,10 @@ class MilvusUploader(Uploader):
 
     @contextmanager
    def get_client(self) -> Generator["MilvusClient", None, None]:
-
-
-
-        try:
+        with self.connection_config.get_client() as client:
+            if db_name := self.upload_config.db_name:
+                client.using_database(db_name=db_name)
             yield client
-        finally:
-            client.close()
-
-    def upload(self, content: UploadContent) -> None:
-        file_extension = content.path.suffix
-        if file_extension == ".json":
-            self.upload_json(content=content)
-        elif file_extension == ".csv":
-            self.upload_csv(content=content)
-        else:
-            raise ValueError(f"Unsupported file extension: {file_extension}")
 
     def delete_by_record_id(self, file_data: FileData) -> None:
         logger.info(
@@ -233,19 +202,9 @@ class MilvusUploader(Uploader):
             err_count = res["err_count"]
             raise WriteError(f"failed to upload {err_count} docs")
 
-    def
-        df = pd.read_csv(content.path)
-        data = df.to_dict(orient="records")
-        self.insert_results(data=data)
-
-    def upload_json(self, content: UploadContent) -> None:
-        with content.path.open("r") as file:
-            data: list[dict] = json.load(file)
-        self.insert_results(data=data)
-
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         self.delete_by_record_id(file_data=file_data)
-        self.
+        self.insert_results(data=data)
 
 
 milvus_destination_entry = DestinationRegistryEntry(
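Finally, the Milvus uploader's new `run_data` path deletes any rows previously written for the same source record before inserting the fresh batch. A small in-memory sketch of that delete-then-insert idempotency pattern (not pymilvus; `FakeCollection` is a hypothetical stand-in):

```python
from typing import Any

RECORD_ID_LABEL = "record_id"


class FakeCollection:
    """In-memory stand-in for a vector collection keyed by a record-id field."""

    def __init__(self) -> None:
        self.rows: list[dict[str, Any]] = []

    def delete_by_record_id(self, record_id: str) -> None:
        # Drop rows written by a previous run of the same source record.
        self.rows = [r for r in self.rows if r.get(RECORD_ID_LABEL) != record_id]

    def insert(self, data: list[dict[str, Any]]) -> None:
        self.rows.extend(data)


def run_data(collection: FakeCollection, data: list[dict], record_id: str) -> None:
    # Delete-then-insert keeps re-processing a document idempotent.
    collection.delete_by_record_id(record_id)
    collection.insert(data)


col = FakeCollection()
run_data(col, [{"record_id": "doc-1", "text": "v1"}], record_id="doc-1")
run_data(col, [{"record_id": "doc-1", "text": "v2"}], record_id="doc-1")
print(col.rows)  # only the rows from the latest run remain
```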