unstructured-ingest 1.0.37__py3-none-any.whl → 1.0.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (27)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/processes/connectors/airtable.py +1 -0
  3. unstructured_ingest/processes/connectors/astradb.py +5 -2
  4. unstructured_ingest/processes/connectors/confluence.py +21 -1
  5. unstructured_ingest/processes/connectors/databricks/volumes.py +7 -5
  6. unstructured_ingest/processes/connectors/discord.py +1 -0
  7. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +9 -2
  8. unstructured_ingest/processes/connectors/gitlab.py +7 -6
  9. unstructured_ingest/processes/connectors/jira.py +1 -0
  10. unstructured_ingest/processes/connectors/local.py +11 -11
  11. unstructured_ingest/processes/connectors/milvus.py +81 -7
  12. unstructured_ingest/processes/connectors/mongodb.py +5 -0
  13. unstructured_ingest/processes/connectors/notion/connector.py +2 -0
  14. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +8 -9
  15. unstructured_ingest/processes/connectors/onedrive.py +1 -0
  16. unstructured_ingest/processes/connectors/outlook.py +3 -2
  17. unstructured_ingest/processes/connectors/pinecone.py +1 -1
  18. unstructured_ingest/processes/connectors/salesforce.py +6 -4
  19. unstructured_ingest/processes/connectors/slack.py +5 -3
  20. unstructured_ingest/processes/connectors/sql/sql.py +6 -1
  21. unstructured_ingest/processes/connectors/zendesk/zendesk.py +10 -6
  22. unstructured_ingest/utils/html.py +14 -1
  23. {unstructured_ingest-1.0.37.dist-info → unstructured_ingest-1.0.41.dist-info}/METADATA +1 -1
  24. {unstructured_ingest-1.0.37.dist-info → unstructured_ingest-1.0.41.dist-info}/RECORD +27 -27
  25. {unstructured_ingest-1.0.37.dist-info → unstructured_ingest-1.0.41.dist-info}/WHEEL +0 -0
  26. {unstructured_ingest-1.0.37.dist-info → unstructured_ingest-1.0.41.dist-info}/entry_points.txt +0 -0
  27. {unstructured_ingest-1.0.37.dist-info → unstructured_ingest-1.0.41.dist-info}/licenses/LICENSE.md +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "1.0.37" # pragma: no cover
+__version__ = "1.0.41" # pragma: no cover
unstructured_ingest/processes/connectors/airtable.py
@@ -184,6 +184,7 @@ class AirtableIndexer(Indexer):
                     filename=str(Path(fullpath).name),
                     fullpath=fullpath,
                 ),
+                display_name=fullpath,
             )

unstructured_ingest/processes/connectors/astradb.py
@@ -195,8 +195,10 @@ class AstraDBIndexer(Indexer):
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         id_batches = batch_generator(ids, self.index_config.batch_size)
-
         for batch in id_batches:
+            batch_items = [BatchItem(identifier=b) for b in batch]
+            display_name = (f"{self.index_config.collection_name}-{self.index_config.keyspace}"
+                            f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]")
             fd = AstraDBBatchFileData(
                 connector_type=CONNECTOR_TYPE,
                 metadata=FileDataSourceMetadata(
@@ -206,7 +208,8 @@ class AstraDBIndexer(Indexer):
                     collection_name=self.index_config.collection_name,
                     keyspace=self.index_config.keyspace,
                 ),
-                batch_items=[BatchItem(identifier=b) for b in batch],
+                batch_items=batch_items,
+                display_name=display_name,
             )
             yield fd

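This release threads a human-readable display_name through the batch-oriented indexers (AstraDB above; Elasticsearch, MongoDB, and SQL below), naming each batch after its container plus the first and last item identifiers. A minimal sketch of that naming scheme, using a hypothetical stand-in for the package's BatchItem model:

from dataclasses import dataclass


# Hypothetical stand-in for the package's BatchItem model.
@dataclass
class BatchItem:
    identifier: str


def batch_display_name(collection_name: str, keyspace: str, batch: list[str]) -> str:
    # Mirrors the AstraDB pattern: "<collection>-<keyspace>-[first..last]".
    batch_items = [BatchItem(identifier=b) for b in batch]
    return (
        f"{collection_name}-{keyspace}"
        f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
    )


print(batch_display_name("docs", "default_keyspace", ["a1", "b2", "c9"]))
# -> docs-default_keyspace-[a1..c9]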
unstructured_ingest/processes/connectors/confluence.py
@@ -33,6 +33,8 @@ from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicod

 if TYPE_CHECKING:
     from atlassian import Confluence
+    from bs4 import BeautifulSoup
+    from bs4.element import Tag

 CONNECTOR_TYPE = "confluence"

@@ -231,15 +233,33 @@ class ConfluenceIndexer(Indexer):
                 metadata=metadata,
                 additional_metadata=additional_metadata,
                 source_identifiers=source_identifiers,
+                display_name=source_identifiers.fullpath,
             )
             yield file_data


-class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
+class ConfluenceDownloaderConfig(HtmlMixin, DownloaderConfig):
     max_num_metadata_permissions: int = Field(
         250, description="Approximate maximum number of permissions included in metadata"
     )

+    @requires_dependencies(["bs4"])
+    def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
+        from bs4.element import Tag
+
+        return [
+            element
+            for element in html_soup.find_all(
+                "a",
+                attrs={
+                    "class": "confluence-embedded-file",
+                    "data-linked-resource-type": "attachment",
+                    "href": True,
+                },
+            )
+            if isinstance(element, Tag)
+        ]
+

 @dataclass
 class ConfluenceDownloader(Downloader):
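The override above narrows Confluence attachment discovery from every anchor tag to only the anchors Confluence emits for embedded file attachments. A standalone sketch of the same BeautifulSoup filter, with illustrative HTML:

from bs4 import BeautifulSoup  # pip install beautifulsoup4

html = """
<a href="/wiki/page">regular page link</a>
<a class="confluence-embedded-file" data-linked-resource-type="attachment"
   href="/download/attachments/123/report.pdf">attachment</a>
"""

soup = BeautifulSoup(html, "html.parser")
# The same attrs filter the new _find_hyperlink_tags applies: only anchors
# that carry Confluence's embedded-file class, the attachment resource type,
# and an href.
tags = soup.find_all(
    "a",
    attrs={
        "class": "confluence-embedded-file",
        "data-linked-resource-type": "attachment",
        "href": True,
    },
)
print([tag["href"] for tag in tags])
# -> ['/download/attachments/123/report.pdf']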
unstructured_ingest/processes/connectors/databricks/volumes.py
@@ -133,14 +133,15 @@ class DatabricksVolumesIndexer(Indexer, ABC):
                     if rel_path.startswith("/"):
                         rel_path = rel_path[1:]
                     filename = Path(file_info.path).name
+                    source_identifiers = SourceIdentifiers(
+                        filename=filename,
+                        rel_path=rel_path,
+                        fullpath=file_info.path,
+                    )
                     yield FileData(
                         identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
                         connector_type=self.connector_type,
-                        source_identifiers=SourceIdentifiers(
-                            filename=filename,
-                            rel_path=rel_path,
-                            fullpath=file_info.path,
-                        ),
+                        source_identifiers=source_identifiers,
                         additional_metadata={
                             "catalog": self.index_config.catalog,
                             "path": file_info.path,
@@ -148,6 +149,7 @@ class DatabricksVolumesIndexer(Indexer, ABC):
                         metadata=FileDataSourceMetadata(
                             url=file_info.path, date_modified=str(file_info.modification_time)
                         ),
+                        display_name=source_identifiers.fullpath,
                     )
         except Exception as e:
             raise self.connection_config.wrap_error(e=e)
unstructured_ingest/processes/connectors/discord.py
@@ -91,6 +91,7 @@ class DiscordIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             source_identifiers=source_identifiers,
             metadata=metadata,
+            display_name=source_identifiers.fullpath,
         )

unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py
@@ -199,17 +199,24 @@ class ElasticsearchIndexer(Indexer):
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         for batch in batch_generator(ids, self.index_config.batch_size):
+            batch_items = [BatchItem(identifier=b) for b in batch]
+            url = f"{self.connection_config.hosts[0]}/{self.index_config.index_name}"
+            display_name = (
+                f"url={url}, batch_size={len(batch_items)} "
+                f"ids={batch_items[0].identifier}..{batch_items[-1].identifier}"
+            )  # noqa: E501
             # Make sure the hash is always a positive number to create identified
             yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
                 metadata=FileDataSourceMetadata(
-                    url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
+                    url=url,
                     date_processed=str(time()),
                 ),
                 additional_metadata=ElastisearchAdditionalMetadata(
                     index_name=self.index_config.index_name,
                 ),
-                batch_items=[BatchItem(identifier=b) for b in batch],
+                batch_items=batch_items,
+                display_name=display_name,
             )

unstructured_ingest/processes/connectors/gitlab.py
@@ -190,21 +190,22 @@ class GitLabIndexer(Indexer):
                     "file_path": file["path"],
                     "ref": ref,
                 }
-
+                source_identifiers = SourceIdentifiers(
+                    fullpath=file["path"],
+                    filename=Path(file["path"]).name,
+                    rel_path=relative_path,
+                )
                 yield FileData(
                     identifier=file["id"],
                     connector_type=CONNECTOR_TYPE,
-                    source_identifiers=SourceIdentifiers(
-                        fullpath=file["path"],
-                        filename=Path(file["path"]).name,
-                        rel_path=relative_path,
-                    ),
+                    source_identifiers=source_identifiers,
                     metadata=FileDataSourceMetadata(
                         url=file["id"],
                         record_locator=record_locator,
                         permissions_data=[{"mode": file["mode"]}],
                     ),
                     additional_metadata={},
+                    display_name=source_identifiers.fullpath,
                 )

unstructured_ingest/processes/connectors/jira.py
@@ -262,6 +262,7 @@ class JiraIndexer(Indexer):
             metadata=metadata,
             additional_metadata=issue.model_dump(),
             source_identifiers=source_identifiers,
+            display_name=source_identifiers.fullpath,
         )
         return file_data

unstructured_ingest/processes/connectors/local.py
@@ -119,21 +119,21 @@ class LocalIndexer(Indexer):

     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         for file_path in self.list_files():
+            source_identifiers = SourceIdentifiers(
+                fullpath=str(file_path.resolve()),
+                filename=file_path.name,
+                rel_path=(
+                    str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[1:]
+                    if not self.index_config.path.is_file()
+                    else self.index_config.path.name
+                ),
+            )
             file_data = FileData(
                 identifier=str(file_path.resolve()),
                 connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(
-                    fullpath=str(file_path.resolve()),
-                    filename=file_path.name,
-                    rel_path=(
-                        str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[
-                            1:
-                        ]
-                        if not self.index_config.path.is_file()
-                        else self.index_config.path.name
-                    ),
-                ),
+                source_identifiers=source_identifiers,
                 metadata=self.get_file_metadata(path=file_path),
+                display_name=source_identifiers.fullpath,
             )
             yield file_data

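For reference, the rel_path expression the LocalIndexer builds strips the configured root prefix from the resolved file path. A quick illustration with hypothetical paths:

from pathlib import Path

# Hypothetical index root and file under it.
index_path = Path("/data/docs")
file_path = Path("/data/docs/reports/q1.pdf")

# Same expression the indexer uses: drop the root prefix plus the
# leading separator, unless the configured path is itself a file.
rel_path = (
    str(file_path.resolve()).replace(str(index_path.resolve()), "")[1:]
    if not index_path.is_file()
    else index_path.name
)
print(rel_path)  # -> reports/q1.pdf (when /data/docs is a directory)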
unstructured_ingest/processes/connectors/milvus.py
@@ -1,7 +1,7 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Generator, Optional, Union
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from dateutil import parser
 from pydantic import Field, Secret
@@ -97,10 +97,16 @@ class MilvusUploadStager(UploadStager):

     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         working_data = element_dict.copy()
-        if self.upload_stager_config.flatten_metadata and (
-            metadata := working_data.pop("metadata", None)
-        ):
-            working_data.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
+
+        if self.upload_stager_config.flatten_metadata:
+            metadata: dict[str, Any] = working_data.pop("metadata", {})
+            flattened_metadata = flatten_dict(
+                metadata,
+                separator="_",
+                flatten_lists=False,
+                remove_none=True,
+            )
+            working_data.update(flattened_metadata)

         # TODO: milvus sdk doesn't seem to support defaults via the schema yet,
         # remove once that gets updated
@@ -154,6 +160,23 @@ class MilvusUploader(Uploader):
     upload_config: MilvusUploaderConfig
     connector_type: str = CONNECTOR_TYPE

+    def has_dynamic_fields_enabled(self) -> bool:
+        """Check if the target collection has dynamic fields enabled."""
+        try:
+            with self.get_client() as client:
+                collection_info = client.describe_collection(self.upload_config.collection_name)
+
+                # Check if dynamic field is enabled
+                # The schema info should contain enable_dynamic_field or enableDynamicField
+                schema_info = collection_info.get(
+                    "enable_dynamic_field",
+                    collection_info.get("enableDynamicField", False),
+                )
+                return bool(schema_info)
+        except Exception as e:
+            logger.warning(f"Could not determine if collection has dynamic fields enabled: {e}")
+            return False
+
     @DestinationConnectionError.wrap
     def precheck(self):
         from pymilvus import MilvusException
@@ -164,6 +187,7 @@ class MilvusUploader(Uploader):
                 raise DestinationConnectionError(
                     f"Collection '{self.upload_config.collection_name}' does not exist"
                 )
+
         except MilvusException as milvus_exception:
             raise DestinationConnectionError(
                 f"failed to precheck Milvus: {str(milvus_exception.message)}"
@@ -193,16 +217,66 @@ class MilvusUploader(Uploader):
         )

     @requires_dependencies(["pymilvus"], extras="milvus")
-    def insert_results(self, data: Union[dict, list[dict]]):
+    def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
+        """
+        Conforms the provided data to the schema of the target Milvus collection.
+        - If dynamic fields are enabled, it ensures JSON-stringified fields are decoded.
+        - If dynamic fields are disabled, it filters out any fields not present in the schema.
+        """
+
+        dynamic_fields_enabled = self.has_dynamic_fields_enabled()
+
+        # If dynamic fields are enabled, 'languages' field needs to be a list
+        if dynamic_fields_enabled:
+            logger.debug("Dynamic fields enabled, ensuring 'languages' field is a list.")
+            prepared_data = []
+            for item in data:
+                new_item = item.copy()
+                if "languages" in new_item and isinstance(new_item["languages"], str):
+                    try:
+                        new_item["languages"] = json.loads(new_item["languages"])
+                    except (json.JSONDecodeError, TypeError):
+                        logger.warning(
+                            f"Could not JSON decode languages field: {new_item['languages']}. "
+                            "Leaving as string.",
+                        )
+                prepared_data.append(new_item)
+            return prepared_data
+
+        # If dynamic fields are not enabled, we need to filter out the metadata fields
+        # to avoid insertion errors for fields not defined in the schema
+        with self.get_client() as client:
+            collection_info = client.describe_collection(
+                self.upload_config.collection_name,
+            )
+            schema_fields = {
+                field["name"]
+                for field in collection_info.get("fields", [])
+                if not field.get("auto_id", False)
+            }
+        # Remove metadata fields that are not part of the base schema
+        filtered_data = []
+        for item in data:
+            filtered_item = {key: value for key, value in item.items() if key in schema_fields}
+            filtered_data.append(filtered_item)
+        return filtered_data
+
+    @requires_dependencies(["pymilvus"], extras="milvus")
+    def insert_results(self, data: list[dict]):
         from pymilvus import MilvusException

         logger.info(
             f"uploading {len(data)} entries to {self.connection_config.db_name} "
             f"db in collection {self.upload_config.collection_name}"
         )
+
+        prepared_data = self._prepare_data_for_insert(data=data)
+
         with self.get_client() as client:
             try:
-                res = client.insert(collection_name=self.upload_config.collection_name, data=data)
+                res = client.insert(
+                    collection_name=self.upload_config.collection_name, data=prepared_data
+                )
             except MilvusException as milvus_exception:
                 raise WriteError(
                     f"failed to upload records to Milvus: {str(milvus_exception.message)}"
unstructured_ingest/processes/connectors/mongodb.py
@@ -149,6 +149,10 @@ class MongoDBIndexer(Indexer):

         for id_batch in batch_generator(ids, batch_size=batch_size):
             # Make sure the hash is always a positive number to create identifier
+            display_name = (
+                f"{self.index_config.database}.{self.index_config.collection}, "
+                f"batch {id_batch[0]}-{id_batch[-1]}"
+            )
             metadata = FileDataSourceMetadata(
                 date_processed=str(time()),
                 record_locator={
@@ -164,6 +168,7 @@ class MongoDBIndexer(Indexer):
                 additional_metadata=MongoDBAdditionalMetadata(
                     collection=self.index_config.collection, database=self.index_config.database
                 ),
+                display_name=display_name,
             )
             yield file_data

unstructured_ingest/processes/connectors/notion/connector.py
@@ -174,6 +174,7 @@ class NotionIndexer(Indexer):
                 source_identifiers=source_identifiers,
                 metadata=metadata,
                 additional_metadata=additional_metadata,
+                display_name=source_identifiers.fullpath,
             )
         except Exception as e:
             logger.error(f"Error retrieving page {page_id}: {e}")
@@ -210,6 +211,7 @@ class NotionIndexer(Indexer):
                 source_identifiers=source_identifiers,
                 metadata=metadata,
                 additional_metadata=additional_metadata,
+                display_name=source_identifiers.fullpath,
             )
         except Exception as e:
             logger.error(f"Error retrieving database {database_id}: {e}")
unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py
@@ -19,11 +19,11 @@ class OriginalSyncedBlock(BlockBase):
     @classmethod
     def from_dict(cls, data: dict):
         """Create OriginalSyncedBlock from dictionary data.
-
+
         Original blocks contain children content.
         """
         if "children" not in data:
-            raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
+            raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
         return cls(children=data["children"])

     def get_html(self) -> Optional[HtmlTag]:
@@ -38,7 +38,7 @@ class DuplicateSyncedBlock(BlockBase):
     @staticmethod
     def can_have_children() -> bool:
         """Check if duplicate synced blocks can have children.
-
+
         Duplicate blocks themselves don't have children directly fetched here,
         but they represent content that does, so Notion API might report has_children=True
         on the parent block object. The actual children are fetched from the original block.
@@ -48,7 +48,7 @@ class DuplicateSyncedBlock(BlockBase):
     @classmethod
     def from_dict(cls, data: dict):
         """Create DuplicateSyncedBlock from dictionary data.
-
+
         Duplicate blocks contain a 'synced_from' reference.
         """
         synced_from_data = data.get("synced_from")
@@ -63,7 +63,7 @@ class DuplicateSyncedBlock(BlockBase):

     def get_html(self) -> Optional[HtmlTag]:
         """Get HTML representation of the duplicate synced block.
-
+
         HTML representation might need fetching the original block's content,
         which is outside the scope of this simple data class.
         """
@@ -74,7 +74,7 @@ class SyncBlock(BlockBase):
     @staticmethod
     def can_have_children() -> bool:
         """Check if synced blocks can have children.
-
+
         Synced blocks (both original and duplicate) can conceptually have children.
         """
         return True
@@ -82,7 +82,7 @@ class SyncBlock(BlockBase):
     @classmethod
     def from_dict(cls, data: dict):
         """Create appropriate SyncedBlock subclass from dictionary data.
-
+
         Determine if it's a duplicate (has 'synced_from') or original (has 'children').
         """
         if data.get("synced_from") is not None:
@@ -99,10 +99,9 @@ class SyncBlock(BlockBase):
             # Consider logging a warning here if strictness is needed.
             return OriginalSyncedBlock(children=[])

-
     def get_html(self) -> Optional[HtmlTag]:
         """Get HTML representation of the synced block.
-
+
         The specific instance returned by from_dict (Original or Duplicate)
         will handle its own get_html logic.
         This method on the base SyncBlock might not be directly called.
unstructured_ingest/processes/connectors/onedrive.py
@@ -223,6 +223,7 @@ class OnedriveIndexer(Indexer):
                 },
             ),
             additional_metadata=self.get_properties_sync(drive_item=drive_item),
+            display_name=server_path,
         )

     async def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData:
unstructured_ingest/processes/connectors/outlook.py
@@ -149,11 +149,11 @@ class OutlookIndexer(Indexer):

     def _message_to_file_data(self, message: "Message") -> FileData:
         fullpath = self._generate_fullpath(message)
-
+        source_identifiers = SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath))
         return FileData(
             identifier=message.id,
             connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath)),
+            source_identifiers=source_identifiers,
             metadata=FileDataSourceMetadata(
                 url=message.resource_url,
                 version=message.change_key,
@@ -178,6 +178,7 @@ class OutlookIndexer(Indexer):
                 "has_attachments": message.has_attachments,
                 "importance": message.importance,
             },
+            display_name=source_identifiers.fullpath,
         )

     def _generate_fullpath(self, message: "Message") -> Path:
unstructured_ingest/processes/connectors/pinecone.py
@@ -240,7 +240,7 @@ class PineconeUploader(VectorDBUploader):
         destination_name: str = "unstructuredautocreated",
         destination_type: Literal["pod", "serverless"] = "serverless",
         serverless_cloud: str = "aws",
-        serverless_region: str = "us-west-2",
+        serverless_region: str = "us-east-1",
         pod_environment: str = "us-east1-gcp",
         pod_type: str = "p1.x1",
         pod_count: int = 1,
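The only change here is the default serverless region, us-west-2 to us-east-1. When the uploader auto-creates its index, the new default amounts to roughly the following sketch; the dimension and metric values are illustrative, not from this diff:

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="YOUR_PINECONE_API_KEY")
pc.create_index(
    name="unstructuredautocreated",  # the uploader's default destination_name
    dimension=384,                   # illustrative embedding size
    metric="cosine",                 # illustrative metric
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # was us-west-2
)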
unstructured_ingest/processes/connectors/salesforce.py
@@ -182,14 +182,15 @@ class SalesforceIndexer(Indexer):
                 record_with_extension = record["Id"] + self.get_file_extension(
                     record["attributes"]["type"]
                 )
+                source_identifiers = SourceIdentifiers(
+                    filename=record_with_extension,
+                    fullpath=f"{record['attributes']['type']}/{record_with_extension}",
+                )
                 files_list.append(
                     FileData(
                         connector_type=CONNECTOR_TYPE,
                         identifier=record["Id"],
-                        source_identifiers=SourceIdentifiers(
-                            filename=record_with_extension,
-                            fullpath=f"{record['attributes']['type']}/{record_with_extension}",
-                        ),
+                        source_identifiers=source_identifiers,
                         metadata=FileDataSourceMetadata(
                             url=record["attributes"]["url"],
                             version=str(parser.parse(record["SystemModstamp"]).timestamp()),
@@ -200,6 +201,7 @@ class SalesforceIndexer(Indexer):
                             record_locator={"id": record["Id"]},
                         ),
                         additional_metadata={"record_type": record["attributes"]["type"]},
+                        display_name=source_identifiers.fullpath,
                     )
                 )
         except SalesforceMalformedRequest as e:
unstructured_ingest/processes/connectors/slack.py
@@ -122,12 +122,13 @@ class SlackIndexer(Indexer):
         identifier = hashlib.sha256(identifier_base.encode("utf-8")).hexdigest()
         filename = identifier[:16]

+        source_identifiers = SourceIdentifiers(
+            filename=f"{filename}.xml", fullpath=f"{filename}.xml"
+        )
         return FileData(
             identifier=identifier,
             connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(
-                filename=f"{filename}.xml", fullpath=f"{filename}.xml"
-            ),
+            source_identifiers=source_identifiers,
             metadata=FileDataSourceMetadata(
                 date_created=ts_oldest,
                 date_modified=ts_newest,
@@ -138,6 +139,7 @@ class SlackIndexer(Indexer):
                     "latest": ts_newest,
                 },
             ),
+            display_name=source_identifiers.fullpath,
         )

     @SourceConnectionError.wrap
unstructured_ingest/processes/connectors/sql/sql.py
@@ -130,7 +130,11 @@ class SQLIndexer(Indexer, ABC):
                 (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
             )
         ]
+
         for batch in id_batches:
+            batch_items = [BatchItem(identifier=str(b)) for b in batch]
+            display_name = (f"{self.index_config.table_name}-{self.index_config.id_column}"
+                            f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]")
             # Make sure the hash is always a positive number to create identified
             yield SqlBatchFileData(
                 connector_type=self.connector_type,
@@ -140,7 +144,8 @@ class SQLIndexer(Indexer, ABC):
                 additional_metadata=SqlAdditionalMetadata(
                     table_name=self.index_config.table_name, id_column=self.index_config.id_column
                 ),
-                batch_items=[BatchItem(identifier=str(b)) for b in batch],
+                batch_items=batch_items,
+                display_name=display_name
             )

unstructured_ingest/processes/connectors/zendesk/zendesk.py
@@ -86,12 +86,13 @@ class ZendeskIndexer(Indexer):
     async def get_tickets(self) -> AsyncGenerator[ZendeskFileData, None]:
         async with self.connection_config.get_client() as client:
             async for ticket in client.get_tickets():
+                source_identifiers = SourceIdentifiers(
+                    filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
+                )
                 yield ZendeskFileData(
                     identifier=str(ticket.id),
                     connector_type=self.connector_type,
-                    source_identifiers=SourceIdentifiers(
-                        filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
-                    ),
+                    source_identifiers=source_identifiers,
                     additional_metadata=ZendeskAdditionalMetadata(
                         item_type="ticket", content=ticket
                     ),
@@ -101,17 +102,19 @@ class ZendeskIndexer(Indexer):
                         date_modified=ticket.updated_at.isoformat() if ticket.updated_at else None,
                         date_processed=str(time()),
                     ),
+                    display_name=source_identifiers.fullpath,
                 )

     async def get_articles(self) -> AsyncGenerator[ZendeskFileData, None]:
         async with self.connection_config.get_client() as client:
             async for article in client.get_articles():
+                source_identifiers = SourceIdentifiers(
+                    filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
+                )
                 yield ZendeskFileData(
                     identifier=str(article.id),
                     connector_type=self.connector_type,
-                    source_identifiers=SourceIdentifiers(
-                        filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
-                    ),
+                    source_identifiers=source_identifiers,
                     additional_metadata=ZendeskAdditionalMetadata(
                         item_type="article", content=article
                     ),
@@ -123,6 +126,7 @@ class ZendeskIndexer(Indexer):
                         ),
                         date_processed=str(time()),
                     ),
+                    display_name=source_identifiers.fullpath,
                 )

     async def run_async(self, **kwargs: Any) -> AsyncGenerator[ZendeskFileData, None]:
unstructured_ingest/utils/html.py
@@ -12,6 +12,7 @@ from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies

 if TYPE_CHECKING:
+    from bs4 import BeautifulSoup
     from bs4.element import Tag
     from requests import Session

@@ -96,7 +97,7 @@ class HtmlMixin(BaseModel):
         from bs4 import BeautifulSoup

         soup = BeautifulSoup(html, "html.parser")
-        tags = soup.find_all("a", href=True)
+        tags = self._find_hyperlink_tags(soup)
         hrefs = [
             tag["href"]
             for tag in tags
@@ -158,3 +159,15 @@ class HtmlMixin(BaseModel):
             )
             for url_to_download in urls_to_download
         ]
+
+    @requires_dependencies(["bs4"])
+    def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
+        """Find hyperlink tags in the HTML.
+
+        Overwrite this method to customize the tag search.
+        """
+        from bs4.element import Tag
+
+        return [
+            element for element in html_soup.find_all("a", href=True) if isinstance(element, Tag)
+        ]
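The hook added above is the extension point the Confluence connector overrides; any HtmlMixin subclass can do the same. A sketch of a hypothetical subclass that keeps only PDF links:

from unstructured_ingest.utils.html import HtmlMixin


# Hypothetical subclass, shown only to illustrate the override point.
class PdfOnlyHtmlMixin(HtmlMixin):
    def _find_hyperlink_tags(self, html_soup):
        from bs4.element import Tag

        # Same shape as the base hook, narrowed to anchors ending in .pdf.
        return [
            element
            for element in html_soup.find_all("a", href=True)
            if isinstance(element, Tag) and str(element["href"]).lower().endswith(".pdf")
        ]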
{unstructured_ingest-1.0.37.dist-info → unstructured_ingest-1.0.41.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unstructured_ingest
-Version: 1.0.37
+Version: 1.0.41
 Summary: Local ETL data pipeline to get data RAG ready
 Author-email: Unstructured Technologies <devops@unstructuredai.io>
 License-Expression: Apache-2.0
{unstructured_ingest-1.0.37.dist-info → unstructured_ingest-1.0.41.dist-info}/RECORD
@@ -1,5 +1,5 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=De73lzt6X-hjX65lK6tF1Rs23QRJQqTCx5Zn-JyPtFI,43
+unstructured_ingest/__version__.py,sha256=3Ji2m0XEMhPfEwuUPkV-t7O-5nZeYDtyM4a_GDFx804,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
 unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -62,37 +62,37 @@ unstructured_ingest/processes/filter.py,sha256=oc3SYukRYfzx8sdJqF3KxdwZcrA-1U8PT
 unstructured_ingest/processes/partitioner.py,sha256=Kn_BSFYvOkwo8fqThw_cOpgD0Um-AdoSqclZplcdNBA,10109
 unstructured_ingest/processes/uncompress.py,sha256=o9JL3Bza4KPUTmrB39-v_5SuK_fYwhwFAhjQi2Pm8h8,2426
 unstructured_ingest/processes/connectors/__init__.py,sha256=cR4ZH2dpPod7QR6OsgMx8X9kpFcEc1TVfQndUNoKGzI,6812
-unstructured_ingest/processes/connectors/airtable.py,sha256=smx5qBSUKwM8V6Xcc7ikrf8hYQUQ94YrB1L0WVeRDv0,9024
-unstructured_ingest/processes/connectors/astradb.py,sha256=Ob9wQgDxa6BXDPZBOqooNKQgvjIZcMwIe4fW3VlI7h8,18929
+unstructured_ingest/processes/connectors/airtable.py,sha256=dDZDKim8ON0yMHv-7cxutjllV4iM9x0RZg0yfP2wQpM,9063
+unstructured_ingest/processes/connectors/astradb.py,sha256=qqgI8ogW00pqxkncclFrkQDC1GWnkUQQ7cDGD9JRjdc,19198
 unstructured_ingest/processes/connectors/azure_ai_search.py,sha256=szhSRXzUHk0DE2hGFfjGc_jNFzlUwiRlCtIkuu7tmnk,11524
 unstructured_ingest/processes/connectors/chroma.py,sha256=q5_Fu4xb6_W_NyrPxVa3-jVwZLqVdlBNlR4dFvbd7l0,7235
-unstructured_ingest/processes/connectors/confluence.py,sha256=C62LVwZYk7H8RfiPb0mbxig2osW5u7KvHIlz4qOJU-0,21954
+unstructured_ingest/processes/connectors/confluence.py,sha256=DU4sv2KiBOJr3hZAJVP2J1vGD7B_eaC_DLzB_MLTtTU,22601
 unstructured_ingest/processes/connectors/couchbase.py,sha256=KCHoYDNya9B05NIB5D78zXoizFyfpJRepcYBe1nLSOs,12298
 unstructured_ingest/processes/connectors/delta_table.py,sha256=2DFox_Vzoopt_D3Jy3rCjrrTGMutG2INIrwCeoIohRY,7340
-unstructured_ingest/processes/connectors/discord.py,sha256=6yEJ_agfKUqsV43wFsbMkcd8lcLJC0uqbo4izjdZ3rU,5294
+unstructured_ingest/processes/connectors/discord.py,sha256=CD-SBECMdr3pnmqbPvBMyPU2cBroXUhyW6F7L3laP6A,5348
 unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
-unstructured_ingest/processes/connectors/gitlab.py,sha256=6h1CdqznJmzeWxGfXrFLdNdT23PExGnUMMX7usK_4Kk,10013
+unstructured_ingest/processes/connectors/gitlab.py,sha256=Fdq6_lk-By1JDmLGVjoKJkaHESiKTZsbvoHhMsljlE0,10114
 unstructured_ingest/processes/connectors/google_drive.py,sha256=jQb4_rKL_tJg7s7m-H8nrvc0GKwxiubtg8KL3-ZIGPM,35304
-unstructured_ingest/processes/connectors/jira.py,sha256=a7OuVi4RFfr22Tqgk60lwmtWTRBw2fI1m8KPqfA8Ffo,18504
+unstructured_ingest/processes/connectors/jira.py,sha256=BuZwExmdcI-R_MGPUwm8TnFh2jEjjwkyA1T51Bgqh-U,18558
 unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
-unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9VhzKZuuQUpDS-cVNAQ2g,7426
-unstructured_ingest/processes/connectors/milvus.py,sha256=Jr9cul7By03tGAPFnFBoqncnNWwbhKd-qbmkuqnin8U,8908
-unstructured_ingest/processes/connectors/mongodb.py,sha256=1g_5bfbS6lah3nsOXqLAanR3zNYJ47_Njw_uV-uj3_U,14324
+unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icGvGAo4hfa-4fzbYajmMSo,7450
+unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
+unstructured_ingest/processes/connectors/mongodb.py,sha256=OmbbmE_pSDVjrn1YfjrQMTTs6JhTOJUU5d_jULxgtaM,14545
 unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
-unstructured_ingest/processes/connectors/onedrive.py,sha256=JIADpc31PI9Yzr0raF6bSqzes2jhfcniUzew1aKVWeI,19305
-unstructured_ingest/processes/connectors/outlook.py,sha256=zHM5frO7CqQG0-KcTyX49aZeSlsvVrl8kh_lR_ESgQw,9275
-unstructured_ingest/processes/connectors/pinecone.py,sha256=pSREUNsQqel6q1EFZsFWelg-uZgGubQY5m_6nVnBFKs,15090
+unstructured_ingest/processes/connectors/onedrive.py,sha256=nZt6hsFMlURgB5-BioFBzJySieRVU8xi99QhOCtorxQ,19343
+unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y1vhjsS8uSg7aZ2LBrTjfHk,9376
+unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
 unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
-unstructured_ingest/processes/connectors/salesforce.py,sha256=OaKEWCqZrirHqFJ650K5jSPwYlWefPOapas8Y-4D9oc,11661
+unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
 unstructured_ingest/processes/connectors/sharepoint.py,sha256=vIfLIactYXcdetccHvKlYOay6NOzGj2X0CkXbY0KuRo,6213
-unstructured_ingest/processes/connectors/slack.py,sha256=EkFj9PcAu5_gF2xLogikKDADLbJYq-_jvchzYrTdLO4,9224
+unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
 unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
 unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
 unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=8a9HTcRWA6IuswSD632b_uZSO6Dax_0rUYnflqktcek,226
 unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
 unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
-unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=yT5JFbVzAEOJsKjfGH8KG3eQfKaTNFEsg_FVDPVK7Xs,8271
+unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=EltntY0i9t7N7__ePfEUanWO9wLy_gxNd48KXz1TxUw,8373
 unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=WhGTp6aRTLSdc4GChCL4mz2b-IanderW8j1IqezX6YA,2958
 unstructured_ingest/processes/connectors/databricks/volumes_azure.py,sha256=pF2d6uAIbwJJUeOIG5xknUMCGc5d9Aztmc2776wp-a0,3740
 unstructured_ingest/processes/connectors/databricks/volumes_gcp.py,sha256=y9AvVl6PtnIxlTlrPj_wyHBDBRJNq3uoTOuZwTryNg8,2994
@@ -103,7 +103,7 @@ unstructured_ingest/processes/connectors/duckdb/base.py,sha256=bTLhilg6mgERNCpee
 unstructured_ingest/processes/connectors/duckdb/duckdb.py,sha256=jsmibTd_yvYzkCT05HhCJvplyobtjfNILC3zyTuCcVY,4464
 unstructured_ingest/processes/connectors/duckdb/motherduck.py,sha256=Atr2MjJQGFGWh5aeiQsLpUbFw-aCZH-ABI1LprDh5VI,4727
 unstructured_ingest/processes/connectors/elasticsearch/__init__.py,sha256=M8mmBWoP6J5R3hxg6BQUMexYlTUxUxdBoIcjUop8yt8,826
-unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=iNedi-JVkAvdF15CbKwVRwXJazyST6ha3zcNyyGwVmQ,19003
+unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=iCC4AP5s8YYa8sMldTFcHp9sfUK1LdQTD0oqXnvklwM,19305
 unstructured_ingest/processes/connectors/elasticsearch/opensearch.py,sha256=wggHvw8h-X0-3WPNxj9rt2xkrE7Pv7CV0B0KzTMzBB4,6944
 unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W8UX0jQbMxBg0ZfITPbEXU7Bwdo1BfI,1843
 unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
@@ -129,7 +129,7 @@ unstructured_ingest/processes/connectors/lancedb/lancedb.py,sha256=qyco2ZPcE-MqE
 unstructured_ingest/processes/connectors/lancedb/local.py,sha256=rhRxoK-h1Q0wdRhUq8Y5y48fbkvvCcIbA4gZvtteHq4,1263
 unstructured_ingest/processes/connectors/notion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/notion/client.py,sha256=wmlkbuER2crKjrqYm1dJwrCe8qH9gX-R4yckg5GQ41I,13174
-unstructured_ingest/processes/connectors/notion/connector.py,sha256=6dPNQQNkEaFMbztVe6fkuB29hnCSBpm3gkacoH6VTNA,13310
+unstructured_ingest/processes/connectors/notion/connector.py,sha256=WdhnB9vZs5nenQJ-DNx4SV7p2-jcQVp3Fe6nxS7Y9SI,13426
 unstructured_ingest/processes/connectors/notion/helpers.py,sha256=Z4qjdsdFyrgE0KwE8gDZdZ88LsP_NYQit697Po6w878,16424
 unstructured_ingest/processes/connectors/notion/interfaces.py,sha256=SrTT-9c0nvk0fMqVgudYF647r04AdMKi6wkIkMy7Szw,563
 unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py,sha256=cfdIJuZDFcF3w84sTyYqZ8vXnSMfMABXFc100r3g5kU,63
@@ -166,7 +166,7 @@ unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py,sh
 unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py,sha256=qvc4orjP2XcbaeBWor-a3xAEglLkyb-epknm7SXgU1E,992
 unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py,sha256=St43RmpefAzDwJKTwz2CdGVm-xeUwHkYgtQtLYQbnw0,1661
 unstructured_ingest/processes/connectors/notion/types/blocks/quote.py,sha256=yl7npmdcO6oFNgTNGVN_Ihvzexv12Xwg1r4NWAOjILQ,1176
-unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py,sha256=aHu4yg8N1EDqZmMEHK7dd7fiQ8Mc8otHQLJPRDbkaT8,4049
+unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py,sha256=aSfFxJKYx1qylOJHwiS_ZAu5pQ-YQZqJM20KGHUvx48,3991
 unstructured_ingest/processes/connectors/notion/types/blocks/table.py,sha256=eYUlRp4uCwjy_eB0mLh7MGMe1qrr_hnOxXS5RfUM2DQ,1724
 unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py,sha256=bR5DdecXFz468okM5WOs10DK8_14Dj7OCLSRusMZzsk,534
 unstructured_ingest/processes/connectors/notion/types/blocks/template.py,sha256=bq2Vh2X7ptpofs9OZnATHySZe2DzbOLsNNfpEI70NgM,968
@@ -207,7 +207,7 @@ unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py,sha256=_
 unstructured_ingest/processes/connectors/sql/postgres.py,sha256=kDIL8Cj45EDpKqit1_araRpP4v3cb__QbYqoINg9f2k,5403
 unstructured_ingest/processes/connectors/sql/singlestore.py,sha256=B46lpvyAj1AArpACi9MXbXD1-52zF6Dsj3RJtD1g4r0,5955
 unstructured_ingest/processes/connectors/sql/snowflake.py,sha256=dkGIFz_VIVhew_FjbuO8r3cVluw7VIUdvV6VjkAItP8,11369
-unstructured_ingest/processes/connectors/sql/sql.py,sha256=e2GKJXBKAPpp-H14PMLMUXSa6pfKctEAVOlH9JqfHF4,15885
+unstructured_ingest/processes/connectors/sql/sql.py,sha256=lqMOUflEXKWhzwuPStrShBn_kbTebkMr4IaIEO-a5PM,16151
 unstructured_ingest/processes/connectors/sql/sqlite.py,sha256=V3OfRrXGGhTa_R2FPA-ysn95HHCv9x_VEBKVDsSGsbs,5549
 unstructured_ingest/processes/connectors/sql/vastdb.py,sha256=trhvUBumDmj2rLjmxFBKw9L9wF6ZpssF0wfmRaG97H0,9803
 unstructured_ingest/processes/connectors/weaviate/__init__.py,sha256=1Vnz8hm_Cf3NkQUTz5ZD4QkbLSVql4UvRoY2j2FnC9k,853
@@ -217,7 +217,7 @@ unstructured_ingest/processes/connectors/weaviate/local.py,sha256=4fgZsL9dgnWuaS
 unstructured_ingest/processes/connectors/weaviate/weaviate.py,sha256=yB67gxvo3X0UaP_mNeB0HbSWXst7ur0E2QKwLA0gIS4,13647
 unstructured_ingest/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/zendesk/client.py,sha256=GvPIpx4aYdD58-edHgvCFjFao94uR0O5Yf4dT9NCmSk,11952
-unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=j5zS_7vJmYDEQtysz_UfwIUH65gc4r-Zjc1LocJr9FM,9033
+unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=doS6d7ZhXBgJN8aPVf7vnQr8BciQbzX8-4yl4_hDZ7w,9253
 unstructured_ingest/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/utils/blob_storage.py,sha256=apMUmm9loxdbTRkkLH4VhG9kUVyiw9PFUJheSDxSxPk,1023
 unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -226,13 +226,13 @@ unstructured_ingest/utils/compression.py,sha256=_BkFREoa0fkJ6z-1lY76HCmy8mLymbPC
 unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
 unstructured_ingest/utils/data_prep.py,sha256=yqrv7x_nlj0y3uaN0m0Bnsekb7VIQnwABWPa24KU5QI,7426
 unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
-unstructured_ingest/utils/html.py,sha256=0WduP8tI5S3nHFQi6XHNPHgsIC9j3iWwyIayX9gDLiE,6386
+unstructured_ingest/utils/html.py,sha256=78ou1vVZ0SJ3c6-Nmxg2iR5MoqubJTvwiuTNMtSFDh4,6816
 unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
 unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
 unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
 unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
-unstructured_ingest-1.0.37.dist-info/METADATA,sha256=wct0um6qunVNGSNozJ0a3UatsfCHDyXG7p9XMNBCTcU,8747
-unstructured_ingest-1.0.37.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-unstructured_ingest-1.0.37.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-1.0.37.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-1.0.37.dist-info/RECORD,,
+unstructured_ingest-1.0.41.dist-info/METADATA,sha256=qZ_2gXsOaoCUdKNUCrrNMpeIs2zjwRX_h2uyinBYRxk,8747
+unstructured_ingest-1.0.41.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+unstructured_ingest-1.0.41.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-1.0.41.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-1.0.41.dist-info/RECORD,,