unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +21 -11
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +4 -6
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/data_prep.py +20 -12
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +3 -2
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +12 -3
- unstructured_ingest/v2/pipeline/pipeline.py +42 -29
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
- unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
- unstructured_ingest/v2/processes/embedder.py +41 -24
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/stage.py

@@ -31,7 +31,7 @@ class UploadStageStep(PipelineStep):
             self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -23,7 +23,7 @@ class UncompressStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -6,6 +6,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )

+from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
+from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE

@@ -92,3 +94,4 @@ add_destination_entry(
 )

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
+add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
unstructured_ingest/v2/processes/connectors/airtable.py (new file)

@@ -0,0 +1,235 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+import pandas
+from pydantic import BaseModel, Field, Secret, field_validator
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from pyairtable import Api
+    from pyairtable.api.types import RecordDict
+
+CONNECTOR_TYPE = "airtable"
+
+
+class AirtableTableMeta(BaseModel):
+    """Metadata specifying a table id, a base id which the table is stored in,
+    and an t.Optional view id in case particular rows and fields are to be ingested"""
+
+    base_id: str
+    table_id: str
+    view_id: Optional[str] = None
+
+    def get_id(self) -> str:
+        id_s = f"{self.base_id}{self.table_id}"
+        id_s = f"{id_s}{self.view_id}" if self.view_id else id_s
+        return str(uuid5(NAMESPACE_DNS, id_s))
+
+
+class AirtableAccessConfig(AccessConfig):
+    personal_access_token: str = Field(
+        description="Personal access token to authenticate into Airtable. Check: "
+        "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
+        "for more info"
+    )
+
+
+class AirtableConnectionConfig(ConnectionConfig):
+    access_config: Secret[AirtableAccessConfig]
+
+    @requires_dependencies(["pyairtable"], extras="airtable")
+    def get_client(self) -> "Api":
+        from pyairtable import Api
+
+        access_config = self.access_config.get_secret_value()
+        return Api(api_key=access_config.personal_access_token)
+
+
+class AirtableIndexerConfig(IndexerConfig):
+    list_of_paths: Optional[list[str]] = Field(
+        default=None,
+        description="""
+    A list of paths that specify the locations to ingest data from within Airtable.
+
+    If this argument is not set, the connector ingests all tables within each and every base.
+    --list-of-paths: path1 path2 path3 ….
+    path: base_id/table_id(optional)/view_id(optional)/
+
+    To obtain (base, table, view) ids in bulk, check:
+    https://airtable.com/developers/web/api/list-bases (base ids)
+    https://airtable.com/developers/web/api/get-base-schema (table and view ids)
+    https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)
+
+    To obtain specific ids from Airtable UI, go to your workspace, and copy any
+    relevant id from the URL structure:
+    https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
+    appAbcDeF1ghijKlm -> base_id
+    tblABcdEfG1HIJkLm -> table_id
+    viwABCDEfg6hijKLM -> view_id
+
+    You can also check: https://support.airtable.com/docs/finding-airtable-ids
+
+    Here is an example for one --list-of-paths:
+        base1/ → gets the entirety of all tables inside base1
+        base1/table1 → gets all rows and columns within table1 in base1
+        base1/table1/view1 → gets the rows and columns that are
+            visible in view1 for the table1 in base1
+
+    Examples to invalid airtable_paths:
+        table1 → has to mention base to be valid
+        base1/view1 → has to mention table to be valid
+    """,
+    )
+
+    @classmethod
+    def validate_path(cls, path: str):
+        components = path.split("/")
+        if len(components) > 3:
+            raise ValueError(
+                f"Path must be of the format: base_id/table_id/view_id, "
+                f"where table id and view id are optional. Got: {path}"
+            )
+
+    @field_validator("list_of_paths")
+    @classmethod
+    def validate_format(cls, v: list[str]) -> list[str]:
+        for path in v:
+            cls.validate_path(path=path)
+        return v
+
+
+@dataclass
+class AirtableIndexer(Indexer):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: AirtableConnectionConfig
+    index_config: AirtableIndexerConfig
+
+    def get_all_table_meta(self) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        bases = client.bases()
+        airtable_meta = []
+        for base in bases:
+            for table in base.schema().tables:
+                airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_base_tables_meta(self, base_id: str) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        base = client.base(base_id=base_id)
+        airtable_meta = []
+        for table in base.tables():
+            airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_meta_from_list(self) -> list[AirtableTableMeta]:
+        airtable_meta = []
+        for path in self.index_config.list_of_paths:
+            components = path.split("/")
+            if len(components) == 1:
+                airtable_meta.extend(self.get_base_tables_meta(base_id=components[0]))
+            elif len(components) == 2:
+                airtable_meta.append(
+                    AirtableTableMeta(base_id=components[0], table_id=components[1])
+                )
+            elif len(components) == 3:
+                airtable_meta.append(
+                    AirtableTableMeta(
+                        base_id=components[0], table_id=components[1], view_id=components[2]
+                    )
+                )
+            else:
+                raise ValueError(
+                    f"Path must be of the format: base_id/table_id/view_id, "
+                    f"where table id and view id are optional. Got: {path}"
+                )
+        return airtable_meta
+
+    def get_table_metas(self) -> list[AirtableTableMeta]:
+        if not self.index_config.list_of_paths:
+            return self.get_all_table_meta()
+        return self.get_meta_from_list()
+
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.request(method="HEAD", url=client.build_url("meta", "bases"))
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        table_metas = self.get_table_metas()
+        for table_meta in table_metas:
+            fullpath = (
+                f"{table_meta.base_id}/{table_meta.table_id}/{table_meta.view_id}.csv"
+                if table_meta.view_id
+                else f"{table_meta.base_id}/{table_meta.table_id}.csv"
+            )
+            yield FileData(
+                identifier=table_meta.get_id(),
+                connector_type=CONNECTOR_TYPE,
+                additional_metadata=table_meta.dict(),
+                source_identifiers=SourceIdentifiers(
+                    filename=str(Path(fullpath).name),
+                    fullpath=fullpath,
+                ),
+            )
+
+
+class AirtableDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class AirtableDownloader(Downloader):
+    connection_config: AirtableConnectionConfig
+    download_config: AirtableDownloaderConfig = field(default_factory=AirtableDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def get_table_contents(self, table_meta: AirtableTableMeta) -> list["RecordDict"]:
+        client = self.connection_config.get_client()
+        table = client.table(base_id=table_meta.base_id, table_name=table_meta.table_id)
+        table_fetch_kwargs = {"view": table_meta.view_id} if table_meta.view_id else {}
+        rows = table.all(**table_fetch_kwargs)
+        return rows
+
+    def _table_row_to_dict(self, table_row: "RecordDict") -> dict:
+        row_dict = {
+            "id": table_row["id"],
+            "created_time": table_row["createdTime"],
+        }
+        row_dict.update(table_row["fields"])
+        return row_dict
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
+        table_contents = self.get_table_contents(table_meta=table_meta)
+        df = pandas.DataFrame.from_dict(
+            data=[self._table_row_to_dict(table_row=row) for row in table_contents]
+        ).sort_index(axis=1)
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(path_or_buf=download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+airtable_source_entry = SourceRegistryEntry(
+    indexer=AirtableIndexer,
+    indexer_config=AirtableIndexerConfig,
+    downloader=AirtableDownloader,
+    downloader_config=AirtableDownloaderConfig,
+    connection_config=AirtableConnectionConfig,
+)
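For orientation, the --list-of-paths format documented above can be exercised directly through the new config model's validator. A minimal sketch (the id values are made up; only paths with at most three slash-separated components pass validation):

from unstructured_ingest.v2.processes.connectors.airtable import AirtableIndexerConfig

# Accepted: base only, base/table, or base/table/view.
config = AirtableIndexerConfig(
    list_of_paths=[
        "appAbcDeF1ghijKlm",
        "appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm",
        "appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM",
    ]
)

# Rejected: more than three components should raise a pydantic validation error
# wrapping the ValueError from validate_path.
AirtableIndexerConfig(list_of_paths=["base1/table1/view1/extra"])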
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -41,9 +41,14 @@ class ChromaAccessConfig(AccessConfig):
     )


+SecretChromaAccessConfig = Secret[ChromaAccessConfig]
+
+
 class ChromaConnectionConfig(ConnectionConfig):
     collection_name: str = Field(description="The name of the Chroma collection to write into.")
-    access_config:
+    access_config: SecretChromaAccessConfig = Field(
+        default=SecretChromaAccessConfig(secret_value=ChromaAccessConfig())
+    )
     path: Optional[str] = Field(
         default=None, description="Location where Chroma is persisted, if not connecting via http."
     )
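The new SecretChromaAccessConfig alias wraps the access config in Pydantic's generic Secret container so the credentials are masked when the model is printed or logged. A rough illustration of the pattern with a stand-in model (DemoAccessConfig and its api_key field are invented for the example):

from typing import Optional

from pydantic import BaseModel, Secret

class DemoAccessConfig(BaseModel):
    api_key: Optional[str] = None

SecretDemoAccessConfig = Secret[DemoAccessConfig]

wrapped = SecretDemoAccessConfig(secret_value=DemoAccessConfig(api_key="sk-example"))
print(wrapped)                     # printed form should be masked, so the key is not leaked
print(wrapped.get_secret_value())  # returns the unwrapped DemoAccessConfig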
unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -104,7 +104,7 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -47,7 +47,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

     def get_access_config(self) -> dict[str, Any]:
-        # Return access_kwargs with oauth. The oauth object
+        # Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
         # because it is not serializable.
        from boxsdk import JWTAuth

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -317,9 +317,9 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

@@ -328,7 +328,7 @@ class FsspecUploader(Uploader):
         # Odd that fsspec doesn't run exists() as async even when client support async
         already_exists = self.fs.exists(path=str(upload_path))
         if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -28,8 +28,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
-
-from .utils import conform_string_to_dict
+from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict

 CONNECTOR_TYPE = "google_drive"


@@ -200,7 +199,7 @@ class GoogleDriveIndexer(Indexer):
         if extensions:
             ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
             q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
-        logger.debug(f"
+        logger.debug(f"query used when indexing: {q}")
         logger.debug("response fields limited to: {}".format(", ".join(self.fields)))
         done = False
         page_token = None
unstructured_ingest/v2/processes/connectors/local.py

@@ -180,14 +180,15 @@ class LocalUploader(Uploader):

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         if source_identifiers := file_data.source_identifiers:
-            identifiers = source_identifiers
             rel_path = (
-
-                if
-                else
+                source_identifiers.relative_path[1:]
+                if source_identifiers.relative_path.startswith("/")
+                else source_identifiers.relative_path
             )
             new_path = self.upload_config.output_path / Path(rel_path)
-            final_path = str(new_path).replace(
+            final_path = str(new_path).replace(
+                source_identifiers.filename, f"{source_identifiers.filename}.json"
+            )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
         Path(final_path).parent.mkdir(parents=True, exist_ok=True)
unstructured_ingest/v2/processes/connectors/milvus.py

@@ -71,7 +71,7 @@ class MilvusUploadStagerConfig(UploadStagerConfig):
     fields_to_include: Optional[list[str]] = None
     """If set - list of fields to include in the output.
     Unspecified fields are removed from the elements.
-    This action
+    This action takes place after metadata flattening.
     Missing fields will cause stager to throw KeyError."""

     flatten_metadata: bool = True
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import json
 from dataclasses import dataclass
 from pathlib import Path

@@ -103,7 +105,7 @@ class OnedriveIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")

-    def list_objects(self, folder, recursive) -> list["DriveItem"]:
+    def list_objects(self, folder: DriveItem, recursive: bool) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
         if not recursive:

@@ -139,12 +141,12 @@ class OnedriveIndexer(Indexer):
         server_path = file_path + "/" + filename
         rel_path = server_path.replace(self.index_config.path, "").lstrip("/")
         date_modified_dt = (
-            parser.parse(drive_item.last_modified_datetime)
+            parser.parse(str(drive_item.last_modified_datetime))
             if drive_item.last_modified_datetime
             else None
         )
         date_created_at = (
-            parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
+            parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
         )
         return FileData(
             identifier=drive_item.id,

@@ -156,7 +158,7 @@ class OnedriveIndexer(Indexer):
             url=drive_item.parent_reference.path + "/" + drive_item.name,
             version=drive_item.etag,
             date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-            date_created=str(date_created_at.timestamp()) if
+            date_created=str(date_created_at.timestamp()) if date_created_at else None,
             date_processed=str(time()),
             record_locator={
                 "user_pname": self.connection_config.user_pname,

@@ -211,9 +213,9 @@ class OnedriveDownloader(Downloader):
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")
         if fsize > MAX_MB_SIZE:
-            logger.info(f"
+            logger.info(f"downloading file with size: {fsize} bytes in chunks")
             with download_path.open(mode="wb") as f:
                 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
         else:
unstructured_ingest/v2/processes/connectors/opensearch.py

@@ -101,7 +101,7 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
-        logger.debug(f"
+        logger.debug(f"opensearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         if client_input.http_auth is not None:
             client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -27,6 +27,7 @@ if TYPE_CHECKING:

 CONNECTOR_TYPE = "pinecone"
 MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
+MAX_POOL_THREADS = 100


 class PineconeAccessConfig(AccessConfig):

@@ -45,7 +46,7 @@ class PineconeConnectionConfig(ConnectionConfig):
     )

     @requires_dependencies(["pinecone"], extras="pinecone")
-    def get_index(self) -> "PineconeIndex":
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
         from pinecone import Pinecone

         from unstructured_ingest import __version__ as unstructured_version

@@ -55,8 +56,8 @@ class PineconeConnectionConfig(ConnectionConfig):
             source_tag=f"unstructured_ingest=={unstructured_version}",
         )

-        index = pc.Index(self.index_name)
-        logger.debug(f"
+        index = pc.Index(name=self.index_name, **index_kwargs)
+        logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index


@@ -65,7 +66,13 @@ class PineconeUploadStagerConfig(UploadStagerConfig):


 class PineconeUploaderConfig(UploaderConfig):
-    batch_size: int = Field(
+    batch_size: Optional[int] = Field(
+        default=None,
+        description="Optional number of records per batch. Will otherwise limit by size.",
+    )
+    pool_threads: Optional[int] = Field(
+        default=1, description="Optional limit on number of threads to use for upload"
+    )


 ALLOWED_FIELDS = (

@@ -149,29 +156,44 @@ class PineconeUploader(Uploader):
         raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["pinecone"], extras="pinecone")
-    def
+    def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException

-
-
-
-
-
-
+        chunks = list(
+            generator_batching_wbytes(
+                iterable=elements_dict,
+                batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100,
+                max_batch_size=self.upload_config.batch_size,
+            )
+        )
+        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")
+
+        max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
+        if self.upload_config.pool_threads:
+            pool_threads = min(self.upload_config.pool_threads, max_pool_threads)
+        else:
+            pool_threads = max_pool_threads
+        index = self.connection_config.get_index(pool_threads=pool_threads)
+        with index:
+            async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks]
+            # Wait for and retrieve responses (this raises in case of error)
+            try:
+                results = [async_result.get() for async_result in async_results]
+            except PineconeApiException as api_error:
+                raise DestinationConnectionError(f"http error: {api_error}") from api_error
+            logger.debug(f"results: {results}")

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
         logger.info(
-            f"writing
+            f"writing a total of {len(elements_dict)} elements via"
+            f" document batches to destination"
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
         )

-
-            elements_dict, MAX_PAYLOAD_SIZE - 100, self.upload_config.batch_size
-        ):
-            self.upsert_batch(batch=batch)
+        self.upsert_batches_async(elements_dict=elements_dict)


 pinecone_destination_entry = DestinationRegistryEntry(
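The new upsert_batches_async path batches records by serialized payload size (kept just under MAX_PAYLOAD_SIZE) and only optionally by record count. The helper generator_batching_wbytes lives in unstructured_ingest/utils/data_prep.py and is not shown in this diff; the following batch_by_bytes function is a hypothetical stand-in that sketches the same idea, for illustration only:

import json
from typing import Any, Generator, Iterable, Optional

def batch_by_bytes(
    iterable: Iterable[dict[str, Any]],
    batch_size_limit_bytes: int,
    max_batch_size: Optional[int] = None,
) -> Generator[list[dict[str, Any]], None, None]:
    # Accumulate items until adding the next one would exceed the byte budget
    # (measured on the JSON-serialized item) or the optional record cap.
    batch: list[dict[str, Any]] = []
    batch_bytes = 0
    for item in iterable:
        item_bytes = len(json.dumps(item).encode("utf-8"))
        over_bytes = batch and batch_bytes + item_bytes > batch_size_limit_bytes
        over_count = max_batch_size is not None and len(batch) >= max_batch_size
        if over_bytes or over_count:
            yield batch
            batch, batch_bytes = [], 0
        batch.append(item)
        batch_bytes += item_bytes
    if batch:
        yield batch

In the actual uploader above, each batch is then handed to index.upsert(vectors=chunk, async_req=True) across a thread pool capped by pool_threads, and the trailing .get() calls surface any Pinecone API error before the step completes.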
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -60,13 +60,16 @@ class SharepointAccessConfig(AccessConfig):


 class SharepointPermissionsConfig(BaseModel):
-    permissions_application_id: str = Field(
-
+    permissions_application_id: Optional[str] = Field(
+        default=None, description="Microsoft Graph API application id"
+    )
+    permissions_tenant: Optional[str] = Field(
+        default=None,
         description="url to get permissions data within tenant.",
         examples=["https://contoso.onmicrosoft.com"],
     )
-    permissions_client_cred: SecretStr = Field(
-        description="Microsoft Graph API application credentials"
+    permissions_client_cred: Optional[SecretStr] = Field(
+        default=None, description="Microsoft Graph API application credentials"
     )
     authority_url: Optional[SecretStr] = Field(
         repr=False,

@@ -139,7 +142,7 @@ class SharepointConnectionConfig(ConnectionConfig):

 class SharepointIndexerConfig(IndexerConfig):
     path: Optional[str] = Field(
-
+        default=None,
         description="Path from which to start parsing files. If the connector is to \
             process all sites within the tenant this filter will be applied to \
             all sites document libraries.",

@@ -335,7 +338,8 @@ class SharepointIndexer(Indexer):
     @property
     def process_permissions(self) -> bool:
         return (
-            self.connection_config.permissions_config
+            self.connection_config.permissions_config is not None
+            and self.connection_config.permissions_config.permissions_tenant
             and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
             and self.connection_config.permissions_config.permissions_application_id
         )