unstructured-ingest 1.0.40-py3-none-any.whl → 1.0.44-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/togetherai.py +1 -1
- unstructured_ingest/processes/connectors/airtable.py +1 -0
- unstructured_ingest/processes/connectors/astradb.py +7 -2
- unstructured_ingest/processes/connectors/confluence.py +6 -2
- unstructured_ingest/processes/connectors/databricks/volumes.py +7 -5
- unstructured_ingest/processes/connectors/delta_table.py +84 -30
- unstructured_ingest/processes/connectors/discord.py +1 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +9 -2
- unstructured_ingest/processes/connectors/gitlab.py +7 -6
- unstructured_ingest/processes/connectors/jira.py +1 -0
- unstructured_ingest/processes/connectors/local.py +11 -11
- unstructured_ingest/processes/connectors/mongodb.py +5 -0
- unstructured_ingest/processes/connectors/notion/connector.py +2 -0
- unstructured_ingest/processes/connectors/onedrive.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +3 -2
- unstructured_ingest/processes/connectors/salesforce.py +6 -4
- unstructured_ingest/processes/connectors/slack.py +5 -3
- unstructured_ingest/processes/connectors/sql/sql.py +8 -1
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +10 -6
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/METADATA +2 -1
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/RECORD +25 -25
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/licenses/LICENSE.md +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "1.0.40"  # pragma: no cover
+__version__ = "1.0.44"  # pragma: no cover
unstructured_ingest/embed/togetherai.py
@@ -22,7 +22,7 @@ if TYPE_CHECKING:
 class TogetherAIEmbeddingConfig(EmbeddingConfig):
     api_key: SecretStr = Field(description="API key for Together AI")
     embedder_model_name: str = Field(
-        default="togethercomputer/m2-bert-80M-
+        default="togethercomputer/m2-bert-80M-32k-retrieval",
         alias="model_name",
         description="Together AI model name",
     )
unstructured_ingest/processes/connectors/astradb.py
@@ -195,8 +195,12 @@ class AstraDBIndexer(Indexer):
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         id_batches = batch_generator(ids, self.index_config.batch_size)
-
         for batch in id_batches:
+            batch_items = [BatchItem(identifier=b) for b in batch]
+            display_name = (
+                f"{self.index_config.collection_name}-{self.index_config.keyspace}"
+                f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
+            )
             fd = AstraDBBatchFileData(
                 connector_type=CONNECTOR_TYPE,
                 metadata=FileDataSourceMetadata(
@@ -206,7 +210,8 @@ class AstraDBIndexer(Indexer):
                 collection_name=self.index_config.collection_name,
                 keyspace=self.index_config.keyspace,
             ),
-            batch_items=[BatchItem(identifier=b) for b in batch],
+            batch_items=batch_items,
+            display_name=display_name,
         )
         yield fd

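The astradb change above (and the matching elasticsearch and SQL changes below) builds a human-readable display_name for each batch before constructing the batch file data. A minimal sketch of the idea, assuming a simple chunking helper; the package's real batch_generator and BatchItem live in its internals and may differ, and the collection/keyspace names here are made up:

from itertools import islice
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def batch_generator(iterable: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    # Yield successive chunks of at most batch_size items from any iterable.
    it = iter(iterable)
    while batch := list(islice(it, batch_size)):
        yield batch


ids = [f"doc-{i:03d}" for i in range(95)]
for batch in batch_generator(ids, 40):
    # Label each batch by its first and last identifier, as the connectors now do.
    display_name = f"my_collection-my_keyspace-[{batch[0]}..{batch[-1]}]"
    print(display_name, len(batch))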
unstructured_ingest/processes/connectors/confluence.py
@@ -186,12 +186,15 @@ class ConfluenceIndexer(Indexer):
         pages = client.get_all_pages_from_space(
             space=space_key,
             start=0,
-            limit=self.index_config.max_num_of_docs_from_each_space,
             expand=None,
             content_type="page",  # blogpost and comment types not currently supported
             status=None,
         )
-        doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in pages]
+        # Limit the number of documents to max_num_of_docs_from_each_space
+        # Note: this is needed because the limit field in client.get_all_pages_from_space does
+        # not seem to work as expected
+        limited_pages = pages[: self.index_config.max_num_of_docs_from_each_space]
+        doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]
         return doc_ids

     def run(self) -> Generator[FileData, None, None]:
@@ -233,6 +236,7 @@ class ConfluenceIndexer(Indexer):
             metadata=metadata,
             additional_metadata=additional_metadata,
             source_identifiers=source_identifiers,
+            display_name=source_identifiers.fullpath,
         )
         yield file_data

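The comment in the hunk explains the workaround: because the limit argument of get_all_pages_from_space does not behave as expected, the cap is enforced client-side by slicing. A hedged, self-contained illustration of the same defensive pattern; fetch_pages and the page shape are stand-ins, not the atlassian client API:

def fetch_pages(space_key: str) -> list[dict]:
    # Stand-in for client.get_all_pages_from_space; may return more pages than asked for.
    return [{"id": f"{space_key}-{i}"} for i in range(250)]


def get_doc_ids(space_key: str, max_docs: int) -> list[dict]:
    pages = fetch_pages(space_key)
    # Enforce the cap locally rather than trusting the server-side limit parameter.
    limited_pages = pages[:max_docs]
    return [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]


assert len(get_doc_ids("ENG", 100)) == 100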
unstructured_ingest/processes/connectors/databricks/volumes.py
@@ -133,14 +133,15 @@ class DatabricksVolumesIndexer(Indexer, ABC):
                 if rel_path.startswith("/"):
                     rel_path = rel_path[1:]
                 filename = Path(file_info.path).name
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                )
                 yield FileData(
                     identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
                     connector_type=self.connector_type,
-                    source_identifiers=SourceIdentifiers(
-                        filename=filename,
-                        rel_path=rel_path,
-                        fullpath=file_info.path,
-                    ),
+                    source_identifiers=source_identifiers,
                     additional_metadata={
                         "catalog": self.index_config.catalog,
                         "path": file_info.path,
@@ -148,6 +149,7 @@ class DatabricksVolumesIndexer(Indexer, ABC):
                     metadata=FileDataSourceMetadata(
                         url=file_info.path, date_modified=str(file_info.modification_time)
                     ),
+                    display_name=source_identifiers.fullpath,
                 )
         except Exception as e:
             raise self.connection_config.wrap_error(e=e)
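This hunk establishes the refactor repeated in the gitlab, local, outlook, salesforce, slack, notion, and zendesk diffs below: build SourceIdentifiers once in a local variable so its fullpath can double as the new display_name field. A simplified sketch using dataclass stand-ins for the package's models (the real classes are richer pydantic models):

from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class SourceIdentifiers:
    filename: str
    fullpath: str
    rel_path: Optional[str] = None


@dataclass
class FileData:
    identifier: str
    source_identifiers: SourceIdentifiers
    display_name: Optional[str] = None


def to_file_data(path: str, rel_path: str) -> FileData:
    # Build the identifiers once, then reuse them for both fields, instead of
    # constructing SourceIdentifiers inline inside the FileData(...) call.
    source_identifiers = SourceIdentifiers(
        filename=Path(path).name,
        fullpath=path,
        rel_path=rel_path,
    )
    return FileData(
        identifier=path,
        source_identifiers=source_identifiers,
        display_name=source_identifiers.fullpath,
    )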
unstructured_ingest/processes/connectors/delta_table.py
@@ -1,7 +1,7 @@
-import os
+import logging
 import traceback
 from dataclasses import dataclass, field
-from multiprocessing import Process, Queue
+from multiprocessing import Process, Queue, current_process
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse
@@ -20,6 +20,7 @@ from unstructured_ingest.interfaces import (
 )
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.utils.constants import RECORD_ID_LABEL
 from unstructured_ingest.utils.data_prep import get_data_df, get_json_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
@@ -47,18 +48,17 @@ class DeltaTableAccessConfig(AccessConfig):

 class DeltaTableConnectionConfig(ConnectionConfig):
     access_config: Secret[DeltaTableAccessConfig] = Field(
-        default=DeltaTableAccessConfig(), validate_default=True
+        default=Secret(DeltaTableAccessConfig()), validate_default=True
     )
     aws_region: Optional[str] = Field(default=None, description="AWS Region")
     table_uri: str = Field(
-        default=None,
         description=(
             "Local path or path to the target folder in the S3 bucket, "
             "formatted as s3://my-bucket/my-folder/"
         ),
     )

-    def update_storage_options(self, storage_options: dict) -> None:
+    def update_storage_options(self, storage_options: dict[str, str]) -> None:
         secrets = self.access_config.get_secret_value()
         if self.aws_region and secrets.aws_access_key_id and secrets.aws_secret_access_key:
             storage_options["AWS_REGION"] = self.aws_region
@@ -80,9 +80,10 @@ class DeltaTableUploadStager(UploadStager):
         default_factory=lambda: DeltaTableUploadStagerConfig()
     )

-    def run(
+    def run(  # type: ignore[override]
         self,
         elements_filepath: Path,
+        file_data: FileData,
         output_dir: Path,
         output_filename: str,
         **kwargs: Any,
@@ -91,6 +92,8 @@ class DeltaTableUploadStager(UploadStager):
         output_path = Path(output_dir) / Path(f"{output_filename}.parquet")

         df = convert_to_pandas_dataframe(elements_dict=elements_contents)
+        # Ensure per-record overwrite/delete semantics: tag each row with the record identifier
+        df[RECORD_ID_LABEL] = file_data.identifier
         df = df.dropna(axis=1, how="all")
         df.to_parquet(output_path)

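The stager now stamps every row with the originating record's identifier, so the uploader can later delete exactly those rows before appending fresh ones. A minimal sketch of the same idea with pandas; the real RECORD_ID_LABEL value lives in unstructured_ingest.utils.constants, and "record_id" here is an assumption:

import pandas as pd

RECORD_ID_LABEL = "record_id"  # assumed value; the real constant is imported above


def stage_elements(elements: list[dict], record_identifier: str) -> pd.DataFrame:
    df = pd.DataFrame(elements)
    # Tag each row so a later delete(predicate=...) can target this record alone,
    # making repeated uploads of the same record idempotent.
    df[RECORD_ID_LABEL] = record_identifier
    return df.dropna(axis=1, how="all")


df = stage_elements([{"text": "hello"}, {"text": "world"}], "doc-123")
print(df)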
@@ -138,41 +141,92 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

+    @requires_dependencies(["tenacity"], extras="delta-table")
     def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
-        updated_upload_path = os.path.join(
-            self.connection_config.table_uri, file_data.source_identifiers.relative_path
-        )
+        upload_path = self.connection_config.table_uri
         logger.info(
-            f"writing {len(df)} rows to destination table "
-            f"at {updated_upload_path}\ndtypes: {df.dtypes}",
+            f"writing {len(df)} rows to destination table at {upload_path}\ndtypes: {df.dtypes}",
         )
-        storage_options = {}
+        storage_options: dict[str, str] = {}
         self.connection_config.update_storage_options(storage_options=storage_options)

+        # Decide whether the Delta table already exists. If it does, we first delete all rows
+        # belonging to the current record and then append the fresh data. Otherwise we will
+        # create a brand-new table via an overwrite.
+
+        mode = "overwrite"
+        try:
+            from deltalake import DeltaTable  # pylint: disable=import-error
+
+            dt = DeltaTable(upload_path, storage_options=storage_options)
+            logger.debug(f"Table exists: deleting rows for {file_data.identifier}")
+            # Table exists – remove any previous rows for this record_id so that appending is
+            # effectively an idempotent overwrite for the record.
+            dt.delete(predicate=f"{RECORD_ID_LABEL} = '{file_data.identifier}'")
+            mode = "append"
+        except Exception:
+            # Table does not exist yet (or cannot be opened) – we will create it below with
+            # mode="overwrite". All other failures will be captured later by the writer.
+            logger.debug("Table does not exist: creating new table")
+
         writer_kwargs = {
-            "table_or_uri": updated_upload_path,
+            "table_or_uri": upload_path,
             "data": df,
-            "mode": "overwrite",
+            "mode": mode,
             "schema_mode": "merge",
             "storage_options": storage_options,
         }
-        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and
-        # cause ingest to fail, even though all tasks are completed normally. Putting the writer
-        # into a process mitigates this issue by ensuring python interpreter waits properly for
-        # deltalake's rust backend to finish
-        queue: Queue = Queue()
-        writer = Process(
-            target=write_deltalake_with_error_handling,
-            kwargs={"queue": queue, **writer_kwargs},
+
+        from tenacity import (
+            before_log,
+            retry,
+            retry_if_exception,
+            stop_after_attempt,
+            wait_random,
         )
-        writer.start()
-        writer.join()

-        # Check if the queue has any exception message
-        if not queue.empty():
-            error_message = queue.get()
-            logger.error("Exception occurred in write_deltalake: %s", error_message)
-            raise RuntimeError(f"Error in write_deltalake: {error_message}")
+        def _is_commit_conflict(exc: BaseException) -> bool:  # noqa: ANN401
+            """Return True if exception looks like a Delta Lake commit conflict."""
+
+            return isinstance(exc, RuntimeError) and (
+                "CommitFailed" in str(exc) or "Metadata changed" in str(exc)
+            )
+
+        @retry(
+            stop=stop_after_attempt(10),
+            wait=wait_random(min=0.2, max=1.0),
+            before=before_log(logger, logging.DEBUG),
+            retry=retry_if_exception(_is_commit_conflict),
+            reraise=True,
+        )
+        def _single_attempt() -> None:
+            """One optimistic transaction: delete old rows, then append new ones."""
+
+            # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and
+            # cause ingest to fail, even though all tasks are completed normally. Putting the writer
+            # into a process mitigates this issue by ensuring python interpreter waits properly for
+            # deltalake's rust backend to finish
+            queue: Queue[str] = Queue()
+
+            if current_process().daemon:
+                # write_deltalake_with_error_handling will push any traceback to our queue
+                write_deltalake_with_error_handling(queue=queue, **writer_kwargs)
+            else:
+                # On non-daemon processes we still guard against SIGABRT by running in a subprocess.
+                writer = Process(
+                    target=write_deltalake_with_error_handling,
+                    kwargs={"queue": queue, **writer_kwargs},
+                )
+                writer.start()
+                writer.join()
+
+            # Check if the queue has any exception message
+            if not queue.empty():
+                error_message = queue.get()
+                logger.error("Exception occurred in write_deltalake: %s", error_message)
+                raise RuntimeError(f"Error in write_deltalake: {error_message}")
+
+        _single_attempt()

     @requires_dependencies(["pandas"], extras="delta-table")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
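The retry wiring in the hunk above is why tenacity joins the delta-table extra (see the METADATA diff below): concurrent uploaders writing to one Delta table can lose the optimistic-concurrency race, and the losing commit surfaces as a RuntimeError mentioning CommitFailed or Metadata changed. A standalone sketch of that retry decorator, using the same tenacity calls; commit_once is a stand-in for one delete-then-append transaction:

import logging

from tenacity import before_log, retry, retry_if_exception, stop_after_attempt, wait_random

logger = logging.getLogger(__name__)


def is_commit_conflict(exc: BaseException) -> bool:
    # String matching mirrors the hunk above; deltalake reports conflicts this way.
    return isinstance(exc, RuntimeError) and (
        "CommitFailed" in str(exc) or "Metadata changed" in str(exc)
    )


@retry(
    stop=stop_after_attempt(10),  # up to 10 optimistic attempts
    wait=wait_random(min=0.2, max=1.0),  # random jitter de-correlates competing writers
    before=before_log(logger, logging.DEBUG),
    retry=retry_if_exception(is_commit_conflict),  # only commit conflicts are retried
    reraise=True,  # surface the original error once attempts are exhausted
)
def commit_once() -> None:
    # Stand-in for one delete-then-append transaction against the Delta table.
    ...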
@@ -182,7 +236,7 @@ class DeltaTableUploader(Uploader):
         self.upload_dataframe(df=df, file_data=file_data)

     @requires_dependencies(["pandas"], extras="delta-table")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:  # type: ignore[override]
         df = get_data_df(path)
         self.upload_dataframe(df=df, file_data=file_data)

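The write itself still runs in a child process to dodge the SIGABRT noted in the diff, with a Queue carrying any traceback back to the parent; when the uploader already sits in a daemon process (which may not spawn children), it falls back to calling the wrapper in-process. A self-contained sketch of that pattern, where do_write stands in for the deltalake writer:

import traceback
from multiprocessing import Process, Queue, current_process


def do_write(**kwargs) -> None:
    # Stand-in for deltalake's writer; assume it may abort the interpreter on some platforms.
    print(f"writing with {kwargs}")


def write_with_error_handling(queue: Queue, **kwargs) -> None:
    # Push the traceback onto the queue instead of letting it die with the subprocess.
    try:
        do_write(**kwargs)
    except Exception:
        queue.put(traceback.format_exc())


def guarded_write(**kwargs) -> None:
    queue: Queue = Queue()
    if current_process().daemon:
        # Daemonic processes are not allowed to have children; run in-process instead.
        write_with_error_handling(queue, **kwargs)
    else:
        writer = Process(target=write_with_error_handling, kwargs={"queue": queue, **kwargs})
        writer.start()
        writer.join()
    if not queue.empty():
        raise RuntimeError(f"Error in writer subprocess: {queue.get()}")


if __name__ == "__main__":
    guarded_write(mode="append")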
unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py
@@ -199,17 +199,24 @@ class ElasticsearchIndexer(Indexer):
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         for batch in batch_generator(ids, self.index_config.batch_size):
+            batch_items = [BatchItem(identifier=b) for b in batch]
+            url = f"{self.connection_config.hosts[0]}/{self.index_config.index_name}"
+            display_name = (
+                f"url={url}, batch_size={len(batch_items)} "
+                f"ids={batch_items[0].identifier}..{batch_items[-1].identifier}"
+            )  # noqa: E501
             # Make sure the hash is always a positive number to create identified
             yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
                 metadata=FileDataSourceMetadata(
-                    url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
+                    url=url,
                     date_processed=str(time()),
                 ),
                 additional_metadata=ElastisearchAdditionalMetadata(
                     index_name=self.index_config.index_name,
                 ),
-                batch_items=[BatchItem(identifier=b) for b in batch],
+                batch_items=batch_items,
+                display_name=display_name,
             )

unstructured_ingest/processes/connectors/gitlab.py
@@ -190,21 +190,22 @@ class GitLabIndexer(Indexer):
                 "file_path": file["path"],
                 "ref": ref,
             }
-
+            source_identifiers = SourceIdentifiers(
+                fullpath=file["path"],
+                filename=Path(file["path"]).name,
+                rel_path=relative_path,
+            )
             yield FileData(
                 identifier=file["id"],
                 connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(
-                    fullpath=file["path"],
-                    filename=Path(file["path"]).name,
-                    rel_path=relative_path,
-                ),
+                source_identifiers=source_identifiers,
                 metadata=FileDataSourceMetadata(
                     url=file["id"],
                     record_locator=record_locator,
                     permissions_data=[{"mode": file["mode"]}],
                 ),
                 additional_metadata={},
+                display_name=source_identifiers.fullpath,
             )

unstructured_ingest/processes/connectors/local.py
@@ -119,21 +119,21 @@ class LocalIndexer(Indexer):

     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         for file_path in self.list_files():
+            source_identifiers = SourceIdentifiers(
+                fullpath=str(file_path.resolve()),
+                filename=file_path.name,
+                rel_path=(
+                    str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[1:]
+                    if not self.index_config.path.is_file()
+                    else self.index_config.path.name
+                ),
+            )
             file_data = FileData(
                 identifier=str(file_path.resolve()),
                 connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(
-                    fullpath=str(file_path.resolve()),
-                    filename=file_path.name,
-                    rel_path=(
-                        str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[
-                            1:
-                        ]
-                        if not self.index_config.path.is_file()
-                        else self.index_config.path.name
-                    ),
-                ),
+                source_identifiers=source_identifiers,
                 metadata=self.get_file_metadata(path=file_path),
+                display_name=source_identifiers.fullpath,
             )
             yield file_data

unstructured_ingest/processes/connectors/mongodb.py
@@ -149,6 +149,10 @@ class MongoDBIndexer(Indexer):

         for id_batch in batch_generator(ids, batch_size=batch_size):
             # Make sure the hash is always a positive number to create identifier
+            display_name = (
+                f"{self.index_config.database}.{self.index_config.collection}, "
+                f"batch {id_batch[0]}-{id_batch[-1]}"
+            )
             metadata = FileDataSourceMetadata(
                 date_processed=str(time()),
                 record_locator={
@@ -164,6 +168,7 @@ class MongoDBIndexer(Indexer):
                 additional_metadata=MongoDBAdditionalMetadata(
                     collection=self.index_config.collection, database=self.index_config.database
                 ),
+                display_name=display_name,
             )
             yield file_data

unstructured_ingest/processes/connectors/notion/connector.py
@@ -174,6 +174,7 @@ class NotionIndexer(Indexer):
                 source_identifiers=source_identifiers,
                 metadata=metadata,
                 additional_metadata=additional_metadata,
+                display_name=source_identifiers.fullpath,
             )
         except Exception as e:
             logger.error(f"Error retrieving page {page_id}: {e}")
@@ -210,6 +211,7 @@ class NotionIndexer(Indexer):
                 source_identifiers=source_identifiers,
                 metadata=metadata,
                 additional_metadata=additional_metadata,
+                display_name=source_identifiers.fullpath,
             )
         except Exception as e:
             logger.error(f"Error retrieving database {database_id}: {e}")
unstructured_ingest/processes/connectors/outlook.py
@@ -149,11 +149,11 @@ class OutlookIndexer(Indexer):

     def _message_to_file_data(self, message: "Message") -> FileData:
         fullpath = self._generate_fullpath(message)
-
+        source_identifiers = SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath))
         return FileData(
             identifier=message.id,
             connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath)),
+            source_identifiers=source_identifiers,
             metadata=FileDataSourceMetadata(
                 url=message.resource_url,
                 version=message.change_key,
@@ -178,6 +178,7 @@ class OutlookIndexer(Indexer):
                 "has_attachments": message.has_attachments,
                 "importance": message.importance,
             },
+            display_name=source_identifiers.fullpath,
         )

     def _generate_fullpath(self, message: "Message") -> Path:
unstructured_ingest/processes/connectors/salesforce.py
@@ -182,14 +182,15 @@ class SalesforceIndexer(Indexer):
                 record_with_extension = record["Id"] + self.get_file_extension(
                     record["attributes"]["type"]
                 )
+                source_identifiers = SourceIdentifiers(
+                    filename=record_with_extension,
+                    fullpath=f"{record['attributes']['type']}/{record_with_extension}",
+                )
                 files_list.append(
                     FileData(
                         connector_type=CONNECTOR_TYPE,
                         identifier=record["Id"],
-                        source_identifiers=SourceIdentifiers(
-                            filename=record_with_extension,
-                            fullpath=f"{record['attributes']['type']}/{record_with_extension}",
-                        ),
+                        source_identifiers=source_identifiers,
                         metadata=FileDataSourceMetadata(
                             url=record["attributes"]["url"],
                             version=str(parser.parse(record["SystemModstamp"]).timestamp()),
@@ -200,6 +201,7 @@ class SalesforceIndexer(Indexer):
                             record_locator={"id": record["Id"]},
                         ),
                         additional_metadata={"record_type": record["attributes"]["type"]},
+                        display_name=source_identifiers.fullpath,
                     )
                 )
         except SalesforceMalformedRequest as e:
unstructured_ingest/processes/connectors/slack.py
@@ -122,12 +122,13 @@ class SlackIndexer(Indexer):
         identifier = hashlib.sha256(identifier_base.encode("utf-8")).hexdigest()
         filename = identifier[:16]

+        source_identifiers = SourceIdentifiers(
+            filename=f"{filename}.xml", fullpath=f"{filename}.xml"
+        )
         return FileData(
             identifier=identifier,
             connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(
-                filename=f"{filename}.xml", fullpath=f"{filename}.xml"
-            ),
+            source_identifiers=source_identifiers,
             metadata=FileDataSourceMetadata(
                 date_created=ts_oldest,
                 date_modified=ts_newest,
@@ -138,6 +139,7 @@ class SlackIndexer(Indexer):
                     "latest": ts_newest,
                 },
             ),
+            display_name=source_identifiers.fullpath,
        )

     @SourceConnectionError.wrap
unstructured_ingest/processes/connectors/sql/sql.py
@@ -130,7 +130,13 @@ class SQLIndexer(Indexer, ABC):
                 (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
             )
         ]
+
         for batch in id_batches:
+            batch_items = [BatchItem(identifier=str(b)) for b in batch]
+            display_name = (
+                f"{self.index_config.table_name}-{self.index_config.id_column}"
+                f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
+            )
             # Make sure the hash is always a positive number to create identified
             yield SqlBatchFileData(
                 connector_type=self.connector_type,
@@ -140,7 +146,8 @@ class SQLIndexer(Indexer, ABC):
                 additional_metadata=SqlAdditionalMetadata(
                     table_name=self.index_config.table_name, id_column=self.index_config.id_column
                 ),
-                batch_items=[BatchItem(identifier=str(b)) for b in batch],
+                batch_items=batch_items,
+                display_name=display_name,
             )

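The context line at the top of the SQL hunk computes the batch count with integer ceiling division, a pattern worth a quick check:

# (n + k - 1) // k is ceiling division without importing math.
ids = list(range(103))
batch_size = 25
num_batches = (len(ids) + batch_size - 1) // batch_size
assert num_batches == 5  # four full batches of 25 plus one batch of 3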
unstructured_ingest/processes/connectors/zendesk/zendesk.py
@@ -86,12 +86,13 @@ class ZendeskIndexer(Indexer):
     async def get_tickets(self) -> AsyncGenerator[ZendeskFileData, None]:
         async with self.connection_config.get_client() as client:
             async for ticket in client.get_tickets():
+                source_identifiers = SourceIdentifiers(
+                    filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
+                )
                 yield ZendeskFileData(
                     identifier=str(ticket.id),
                     connector_type=self.connector_type,
-                    source_identifiers=SourceIdentifiers(
-                        filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
-                    ),
+                    source_identifiers=source_identifiers,
                     additional_metadata=ZendeskAdditionalMetadata(
                         item_type="ticket", content=ticket
                     ),
@@ -101,17 +102,19 @@ class ZendeskIndexer(Indexer):
                         date_modified=ticket.updated_at.isoformat() if ticket.updated_at else None,
                         date_processed=str(time()),
                     ),
+                    display_name=source_identifiers.fullpath,
                 )

     async def get_articles(self) -> AsyncGenerator[ZendeskFileData, None]:
         async with self.connection_config.get_client() as client:
             async for article in client.get_articles():
+                source_identifiers = SourceIdentifiers(
+                    filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
+                )
                 yield ZendeskFileData(
                     identifier=str(article.id),
                     connector_type=self.connector_type,
-                    source_identifiers=SourceIdentifiers(
-                        filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
-                    ),
+                    source_identifiers=source_identifiers,
                     additional_metadata=ZendeskAdditionalMetadata(
                         item_type="article", content=article
                     ),
@@ -123,6 +126,7 @@ class ZendeskIndexer(Indexer):
                         ),
                         date_processed=str(time()),
                     ),
+                    display_name=source_identifiers.fullpath,
                 )

     async def run_async(self, **kwargs: Any) -> AsyncGenerator[ZendeskFileData, None]:
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unstructured_ingest
-Version: 1.0.40
+Version: 1.0.44
 Summary: Local ETL data pipeline to get data RAG ready
 Author-email: Unstructured Technologies <devops@unstructuredai.io>
 License-Expression: Apache-2.0
@@ -60,6 +60,7 @@ Provides-Extra: delta-table
 Requires-Dist: boto3; extra == 'delta-table'
 Requires-Dist: deltalake; extra == 'delta-table'
 Requires-Dist: pandas; extra == 'delta-table'
+Requires-Dist: tenacity; extra == 'delta-table'
 Provides-Extra: discord
 Requires-Dist: discord-py; extra == 'discord'
 Provides-Extra: doc
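The new tenacity requirement is scoped to the same extra as the other Delta Lake dependencies, so it is only pulled in when the connector is installed via the delta-table extra (for example, pip install "unstructured-ingest[delta-table]").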
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/RECORD RENAMED
@@ -1,5 +1,5 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=12SSwrWI8zU57pbaRSeJH9dGmuvWZXi056-PfBAhJTw,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
 unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -29,7 +29,7 @@ unstructured_ingest/embed/interfaces.py,sha256=Y3PLhgWnMDmtpugE37hlAiBIbC8izrFFX
 unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzxVbBLUbD1Wbns,3922
 unstructured_ingest/embed/octoai.py,sha256=yZuD7R4mEKS4Jjyae_IrNWogMPOFFS8gW5oUllj3ROU,4540
 unstructured_ingest/embed/openai.py,sha256=TMEOPVfm_OSs4tb3Ymd6q5J49R_-YKvO4TOqCHb3bwk,4647
-unstructured_ingest/embed/togetherai.py,sha256=
+unstructured_ingest/embed/togetherai.py,sha256=ykaveEUBxBGBzRlmWc9utCFQuUWHdbW4F9KAb-uBAJM,3630
 unstructured_ingest/embed/vertexai.py,sha256=DphvPhiYdXTMrQxJCd-64vMs4iVdLY_BphHqz3n5HfM,3758
 unstructured_ingest/embed/voyageai.py,sha256=EOrYzaoXOZ6C4fNkMlCgb8KA8rdfgVXN3USMFpnn0Bs,4698
 unstructured_ingest/interfaces/__init__.py,sha256=QIkWqjsq9INTa89gPuXlMlQL4s3y5TqLmPkuVuTyXcs,795
@@ -62,37 +62,37 @@ unstructured_ingest/processes/filter.py,sha256=oc3SYukRYfzx8sdJqF3KxdwZcrA-1U8PT
 unstructured_ingest/processes/partitioner.py,sha256=Kn_BSFYvOkwo8fqThw_cOpgD0Um-AdoSqclZplcdNBA,10109
 unstructured_ingest/processes/uncompress.py,sha256=o9JL3Bza4KPUTmrB39-v_5SuK_fYwhwFAhjQi2Pm8h8,2426
 unstructured_ingest/processes/connectors/__init__.py,sha256=cR4ZH2dpPod7QR6OsgMx8X9kpFcEc1TVfQndUNoKGzI,6812
-unstructured_ingest/processes/connectors/airtable.py,sha256=
-unstructured_ingest/processes/connectors/astradb.py,sha256=
+unstructured_ingest/processes/connectors/airtable.py,sha256=dDZDKim8ON0yMHv-7cxutjllV4iM9x0RZg0yfP2wQpM,9063
+unstructured_ingest/processes/connectors/astradb.py,sha256=qi9G3s88GYSV3TXNrbcO0n32SuxO-uagtUIodjgyKVU,19216
 unstructured_ingest/processes/connectors/azure_ai_search.py,sha256=szhSRXzUHk0DE2hGFfjGc_jNFzlUwiRlCtIkuu7tmnk,11524
 unstructured_ingest/processes/connectors/chroma.py,sha256=q5_Fu4xb6_W_NyrPxVa3-jVwZLqVdlBNlR4dFvbd7l0,7235
-unstructured_ingest/processes/connectors/confluence.py,sha256=
+unstructured_ingest/processes/connectors/confluence.py,sha256=aA2B_FPdAjlVAJtmMldYu6lld2sR-6JL5tWh7yItiwg,22828
 unstructured_ingest/processes/connectors/couchbase.py,sha256=KCHoYDNya9B05NIB5D78zXoizFyfpJRepcYBe1nLSOs,12298
-unstructured_ingest/processes/connectors/delta_table.py,sha256=
-unstructured_ingest/processes/connectors/discord.py,sha256=
+unstructured_ingest/processes/connectors/delta_table.py,sha256=Y3yJPfwTyDdv7dqn54ZLZ4DBjg9OF2rXuUaNfbPCkvc,9993
+unstructured_ingest/processes/connectors/discord.py,sha256=CD-SBECMdr3pnmqbPvBMyPU2cBroXUhyW6F7L3laP6A,5348
 unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
-unstructured_ingest/processes/connectors/gitlab.py,sha256=
+unstructured_ingest/processes/connectors/gitlab.py,sha256=Fdq6_lk-By1JDmLGVjoKJkaHESiKTZsbvoHhMsljlE0,10114
 unstructured_ingest/processes/connectors/google_drive.py,sha256=jQb4_rKL_tJg7s7m-H8nrvc0GKwxiubtg8KL3-ZIGPM,35304
-unstructured_ingest/processes/connectors/jira.py,sha256=
+unstructured_ingest/processes/connectors/jira.py,sha256=BuZwExmdcI-R_MGPUwm8TnFh2jEjjwkyA1T51Bgqh-U,18558
 unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
-unstructured_ingest/processes/connectors/local.py,sha256=
+unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icGvGAo4hfa-4fzbYajmMSo,7450
 unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
-unstructured_ingest/processes/connectors/mongodb.py,sha256=
+unstructured_ingest/processes/connectors/mongodb.py,sha256=OmbbmE_pSDVjrn1YfjrQMTTs6JhTOJUU5d_jULxgtaM,14545
 unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
-unstructured_ingest/processes/connectors/onedrive.py,sha256=
-unstructured_ingest/processes/connectors/outlook.py,sha256=
+unstructured_ingest/processes/connectors/onedrive.py,sha256=nZt6hsFMlURgB5-BioFBzJySieRVU8xi99QhOCtorxQ,19343
+unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y1vhjsS8uSg7aZ2LBrTjfHk,9376
 unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
 unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
-unstructured_ingest/processes/connectors/salesforce.py,sha256=
+unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
 unstructured_ingest/processes/connectors/sharepoint.py,sha256=vIfLIactYXcdetccHvKlYOay6NOzGj2X0CkXbY0KuRo,6213
-unstructured_ingest/processes/connectors/slack.py,sha256=
+unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
 unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
 unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
 unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=8a9HTcRWA6IuswSD632b_uZSO6Dax_0rUYnflqktcek,226
 unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
 unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
-unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=
+unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=EltntY0i9t7N7__ePfEUanWO9wLy_gxNd48KXz1TxUw,8373
 unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=WhGTp6aRTLSdc4GChCL4mz2b-IanderW8j1IqezX6YA,2958
 unstructured_ingest/processes/connectors/databricks/volumes_azure.py,sha256=pF2d6uAIbwJJUeOIG5xknUMCGc5d9Aztmc2776wp-a0,3740
 unstructured_ingest/processes/connectors/databricks/volumes_gcp.py,sha256=y9AvVl6PtnIxlTlrPj_wyHBDBRJNq3uoTOuZwTryNg8,2994
@@ -103,7 +103,7 @@ unstructured_ingest/processes/connectors/duckdb/base.py,sha256=bTLhilg6mgERNCpee
 unstructured_ingest/processes/connectors/duckdb/duckdb.py,sha256=jsmibTd_yvYzkCT05HhCJvplyobtjfNILC3zyTuCcVY,4464
 unstructured_ingest/processes/connectors/duckdb/motherduck.py,sha256=Atr2MjJQGFGWh5aeiQsLpUbFw-aCZH-ABI1LprDh5VI,4727
 unstructured_ingest/processes/connectors/elasticsearch/__init__.py,sha256=M8mmBWoP6J5R3hxg6BQUMexYlTUxUxdBoIcjUop8yt8,826
-unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=
+unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=iCC4AP5s8YYa8sMldTFcHp9sfUK1LdQTD0oqXnvklwM,19305
 unstructured_ingest/processes/connectors/elasticsearch/opensearch.py,sha256=wggHvw8h-X0-3WPNxj9rt2xkrE7Pv7CV0B0KzTMzBB4,6944
 unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W8UX0jQbMxBg0ZfITPbEXU7Bwdo1BfI,1843
 unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
@@ -129,7 +129,7 @@ unstructured_ingest/processes/connectors/lancedb/lancedb.py,sha256=qyco2ZPcE-MqE
 unstructured_ingest/processes/connectors/lancedb/local.py,sha256=rhRxoK-h1Q0wdRhUq8Y5y48fbkvvCcIbA4gZvtteHq4,1263
 unstructured_ingest/processes/connectors/notion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/notion/client.py,sha256=wmlkbuER2crKjrqYm1dJwrCe8qH9gX-R4yckg5GQ41I,13174
-unstructured_ingest/processes/connectors/notion/connector.py,sha256=
+unstructured_ingest/processes/connectors/notion/connector.py,sha256=WdhnB9vZs5nenQJ-DNx4SV7p2-jcQVp3Fe6nxS7Y9SI,13426
 unstructured_ingest/processes/connectors/notion/helpers.py,sha256=Z4qjdsdFyrgE0KwE8gDZdZ88LsP_NYQit697Po6w878,16424
 unstructured_ingest/processes/connectors/notion/interfaces.py,sha256=SrTT-9c0nvk0fMqVgudYF647r04AdMKi6wkIkMy7Szw,563
 unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py,sha256=cfdIJuZDFcF3w84sTyYqZ8vXnSMfMABXFc100r3g5kU,63
@@ -207,7 +207,7 @@ unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py,sha256=_
 unstructured_ingest/processes/connectors/sql/postgres.py,sha256=kDIL8Cj45EDpKqit1_araRpP4v3cb__QbYqoINg9f2k,5403
 unstructured_ingest/processes/connectors/sql/singlestore.py,sha256=B46lpvyAj1AArpACi9MXbXD1-52zF6Dsj3RJtD1g4r0,5955
 unstructured_ingest/processes/connectors/sql/snowflake.py,sha256=dkGIFz_VIVhew_FjbuO8r3cVluw7VIUdvV6VjkAItP8,11369
-unstructured_ingest/processes/connectors/sql/sql.py,sha256=
+unstructured_ingest/processes/connectors/sql/sql.py,sha256=jIwAck_vFlsMczH7BOyI-iZC_lrLAV-1eqmGtKkPNQc,16170
 unstructured_ingest/processes/connectors/sql/sqlite.py,sha256=V3OfRrXGGhTa_R2FPA-ysn95HHCv9x_VEBKVDsSGsbs,5549
 unstructured_ingest/processes/connectors/sql/vastdb.py,sha256=trhvUBumDmj2rLjmxFBKw9L9wF6ZpssF0wfmRaG97H0,9803
 unstructured_ingest/processes/connectors/weaviate/__init__.py,sha256=1Vnz8hm_Cf3NkQUTz5ZD4QkbLSVql4UvRoY2j2FnC9k,853
@@ -217,7 +217,7 @@ unstructured_ingest/processes/connectors/weaviate/local.py,sha256=4fgZsL9dgnWuaS
 unstructured_ingest/processes/connectors/weaviate/weaviate.py,sha256=yB67gxvo3X0UaP_mNeB0HbSWXst7ur0E2QKwLA0gIS4,13647
 unstructured_ingest/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/zendesk/client.py,sha256=GvPIpx4aYdD58-edHgvCFjFao94uR0O5Yf4dT9NCmSk,11952
-unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=
+unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=doS6d7ZhXBgJN8aPVf7vnQr8BciQbzX8-4yl4_hDZ7w,9253
 unstructured_ingest/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/utils/blob_storage.py,sha256=apMUmm9loxdbTRkkLH4VhG9kUVyiw9PFUJheSDxSxPk,1023
 unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
 unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
 unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
 unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
-unstructured_ingest-1.0.40.dist-info/METADATA,sha256=
-unstructured_ingest-1.0.40.dist-info/WHEEL,sha256=
-unstructured_ingest-1.0.40.dist-info/entry_points.txt,sha256=
-unstructured_ingest-1.0.40.dist-info/licenses/LICENSE.md,sha256=
-unstructured_ingest-1.0.40.dist-info/RECORD,,
+unstructured_ingest-1.0.44.dist-info/METADATA,sha256=PR_LHUUQP-2oayEmsoTGblqWKPmJt46QtijI7y-zni0,8795
+unstructured_ingest-1.0.44.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+unstructured_ingest-1.0.44.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-1.0.44.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-1.0.44.dist-info/RECORD,,
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/WHEEL RENAMED: file without changes
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/entry_points.txt RENAMED: file without changes
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/licenses/LICENSE.md RENAMED: file without changes