unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (78):
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_confluence.py +113 -0
  4. test/integration/connectors/test_kafka.py +167 -0
  5. test/integration/connectors/test_onedrive.py +112 -0
  6. test/integration/connectors/test_pinecone.py +161 -0
  7. test/integration/connectors/test_qdrant.py +137 -0
  8. test/integration/connectors/test_s3.py +23 -0
  9. test/integration/connectors/utils/docker.py +2 -1
  10. test/integration/connectors/utils/validation.py +73 -22
  11. test/unit/v2/__init__.py +0 -0
  12. test/unit/v2/chunkers/__init__.py +0 -0
  13. test/unit/v2/chunkers/test_chunkers.py +49 -0
  14. test/unit/v2/connectors/__init__.py +0 -0
  15. test/unit/v2/embedders/__init__.py +0 -0
  16. test/unit/v2/embedders/test_bedrock.py +36 -0
  17. test/unit/v2/embedders/test_huggingface.py +48 -0
  18. test/unit/v2/embedders/test_mixedbread.py +37 -0
  19. test/unit/v2/embedders/test_octoai.py +35 -0
  20. test/unit/v2/embedders/test_openai.py +35 -0
  21. test/unit/v2/embedders/test_togetherai.py +37 -0
  22. test/unit/v2/embedders/test_vertexai.py +37 -0
  23. test/unit/v2/embedders/test_voyageai.py +38 -0
  24. test/unit/v2/partitioners/__init__.py +0 -0
  25. test/unit/v2/partitioners/test_partitioner.py +63 -0
  26. test/unit/v2/utils/__init__.py +0 -0
  27. test/unit/v2/utils/data_generator.py +32 -0
  28. unstructured_ingest/__version__.py +1 -1
  29. unstructured_ingest/cli/cmds/__init__.py +2 -2
  30. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  31. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  32. unstructured_ingest/connector/kafka.py +0 -1
  33. unstructured_ingest/interfaces.py +7 -7
  34. unstructured_ingest/runner/writers/__init__.py +2 -2
  35. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  36. unstructured_ingest/v2/constants.py +2 -0
  37. unstructured_ingest/v2/processes/chunker.py +2 -2
  38. unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
  39. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  40. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  41. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  42. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  43. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  44. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  45. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  46. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  47. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
  48. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  49. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  50. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  51. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
  52. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
  53. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  54. unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
  55. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  56. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  57. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  58. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  59. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  60. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  61. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  62. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  63. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  64. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  65. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  66. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  67. unstructured_ingest/v2/processes/partitioner.py +14 -3
  68. unstructured_ingest/v2/unstructured_api.py +24 -10
  69. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
  70. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
  71. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  72. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  73. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  74. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  75. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  76. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  77. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  78. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
--- a/unstructured_ingest/v2/processes/connectors/__init__.py
+++ b/unstructured_ingest/v2/processes/connectors/__init__.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
@@ -12,16 +14,20 @@ from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry, astra_db_source_entry
-from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
-from .azure_cognitive_search import azure_cognitive_search_destination_entry
+from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
+from .azure_ai_search import azure_ai_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
+from .confluence import confluence_source_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
+from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
@@ -33,7 +39,7 @@ from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
-from .onedrive import onedrive_source_entry
+from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -72,6 +78,7 @@ add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
 add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)
 
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
+add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
 add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
 add_destination_entry(
@@ -90,8 +97,8 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_
 
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
-    destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
-    entry=azure_cognitive_search_destination_entry,
+    destination_type=AZURE_AI_SEARCH_CONNECTOR_TYPE,
+    entry=azure_ai_search_destination_entry,
 )
 
 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
@@ -99,4 +106,8 @@ add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entr
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
 
+add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
+
 add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
+
+add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
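
For orientation, a minimal sketch (not from the package) of what these registry changes mean for user code in 0.3.0; it relies only on module and symbol names that appear in the diff and in the file list above:

import unstructured_ingest.v2.processes.connectors  # noqa: F401
# Importing the connectors package executes the add_source_entry/add_destination_entry
# calls shown above, so the kafka and qdrant subpackages plus the Confluence and GitLab
# sources are wired in as a side effect of the import.

# The Azure destination now lives under its new module name:
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    azure_ai_search_destination_entry,
)
# The pre-0.3.0 module path
# unstructured_ingest.v2.processes.connectors.azure_cognitive_search was renamed
# and is no longer importable.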
--- a/unstructured_ingest/v2/processes/connectors/airtable.py
+++ b/unstructured_ingest/v2/processes/connectors/airtable.py
@@ -12,11 +12,11 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
@@ -214,7 +214,7 @@ class AirtableDownloader(Downloader):
         row_dict.update(table_row["fields"])
         return row_dict
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
         df = pandas.DataFrame.from_dict(
--- a/unstructured_ingest/v2/processes/connectors/astradb.py
+++ b/unstructured_ingest/v2/processes/connectors/astradb.py
@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -129,11 +130,6 @@ class AstraDBIndexerConfig(IndexerConfig):
         "numbers, and underscores."
     )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     batch_size: int = Field(default=20, description="Number of records per batch")
 
 
@@ -147,21 +143,17 @@ class AstraDBUploaderConfig(UploaderConfig):
         "Note that the collection name must only include letters, "
         "numbers, and underscores."
     )
-    embedding_dimension: int = Field(
-        default=384, description="The dimensionality of the embeddings"
-    )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
         examples=['{"deny": ["metadata"]}'],
     )
     batch_size: int = Field(default=20, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -173,7 +165,7 @@ class AstraDBIndexer(Indexer):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.index_config.collection_name,
-            keyspace=self.index_config.keyspace or self.index_config.namespace,
+            keyspace=self.index_config.keyspace,
         )
 
     def precheck(self) -> None:
@@ -223,7 +215,7 @@ class AstraDBIndexer(Indexer):
             additional_metadata={
                 "ids": list(batch),
                 "collection_name": self.index_config.collection_name,
-                "keyspace": self.index_config.keyspace or self.index_config.namespace,
+                "keyspace": self.index_config.keyspace,
             },
         )
         yield fd
@@ -309,10 +301,11 @@ class AstraDBUploadStager(UploadStager):
         default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),
+            RECORD_ID_LABEL: file_data.identifier,
             "metadata": element_dict,
         }
 
@@ -328,10 +321,15 @@ class AstraDBUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
         conformed_elements = []
         for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element))
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
+        output_filename_path = Path(output_filename)
+        if output_filename_path.suffix == ".json":
+            output_path = Path(output_dir) / output_filename_path
+        else:
+            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
@@ -346,7 +344,7 @@ class AstraDBUploader(Uploader):
             get_astra_collection(
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
-                keyspace=self.upload_config.keyspace or self.upload_config.namespace,
+                keyspace=self.upload_config.keyspace,
             )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
@@ -357,7 +355,19 @@ class AstraDBUploader(Uploader):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace or self.upload_config.namespace,
+            keyspace=self.upload_config.keyspace,
+        )
+
+    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
+        logger.debug(
+            f"deleting records from collection {collection.name} "
+            f"with {self.upload_config.record_id_key} "
+            f"set to {file_data.identifier}"
+        )
+        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        delete_resp = collection.delete_many(filter=delete_filter)
+        logger.debug(
+            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -371,6 +381,8 @@ class AstraDBUploader(Uploader):
         astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()
 
+        self.delete_by_record_id(collection=collection, file_data=file_data)
+
         for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)

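A minimal sketch (not part of the package) of the record tagging this change introduces for Astra DB: each staged element now carries the originating file's identifier under RECORD_ID_LABEL, and delete_by_record_id() uses that field to clear stale entries before re-inserting. The sample element and identifier below are made up; only the constant and the dictionary shape come from the diff above:

from unstructured_ingest.v2.constants import RECORD_ID_LABEL

def conform_element(element_dict: dict, record_id: str) -> dict:
    # mirrors AstraDBUploadStager.conform_dict from the hunk above
    return {
        "$vector": element_dict.pop("embeddings", None),
        "content": element_dict.pop("text", None),
        RECORD_ID_LABEL: record_id,
        "metadata": element_dict,
    }

element = {"text": "Hello world", "embeddings": [0.1, 0.2], "type": "NarrativeText"}
record = conform_element(element, record_id="example-file-id")  # illustrative identifier
print(record)
# On upload, previously inserted rows for the same file are removed with a filter of the
# form {record_id_key: {"$eq": "example-file-id"}} before the new batch is written.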
--- a/unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py
+++ b/unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -26,18 +27,18 @@ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
 
 if TYPE_CHECKING:
     from azure.search.documents import SearchClient
+    from azure.search.documents.indexes import SearchIndexClient
 
-CONNECTOR_TYPE = "azure_cognitive_search"
+CONNECTOR_TYPE = "azure_ai_search"
 
 
-class AzureCognitiveSearchAccessConfig(AccessConfig):
-    azure_cognitive_search_key: str = Field(
+class AzureAISearchAccessConfig(AccessConfig):
+    azure_ai_search_key: str = Field(
         alias="key", description="Credential that is used for authenticating to an Azure service"
     )
 
 
-class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
+class AzureAISearchConnectionConfig(ConnectionConfig):
     endpoint: str = Field(
         description="The URL endpoint of an Azure AI (Cognitive) search service. "
         "In the form of https://{{service_name}}.search.windows.net"
@@ -45,10 +46,10 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
     index: str = Field(
         description="The name of the Azure AI (Cognitive) Search index to connect to."
     )
-    access_config: Secret[AzureCognitiveSearchAccessConfig]
+    access_config: Secret[AzureAISearchAccessConfig]
 
-    @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
-    def generate_client(self) -> "SearchClient":
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_client(self) -> "SearchClient":
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient
 
@@ -56,27 +57,43 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
-                self.access_config.get_secret_value().azure_cognitive_search_key
+                self.access_config.get_secret_value().azure_ai_search_key
+            ),
+        )
+
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_index_client(self) -> "SearchIndexClient":
+        from azure.core.credentials import AzureKeyCredential
+        from azure.search.documents.indexes import SearchIndexClient
+
+        return SearchIndexClient(
+            endpoint=self.endpoint,
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_ai_search_key
             ),
         )
 
 
-class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
+class AzureAISearchUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-class AzureCognitiveSearchUploaderConfig(UploaderConfig):
+class AzureAISearchUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
-class AzureCognitiveSearchUploadStager(UploadStager):
-    upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field(
-        default_factory=lambda: AzureCognitiveSearchUploadStagerConfig()
+class AzureAISearchUploadStager(UploadStager):
+    upload_stager_config: AzureAISearchUploadStagerConfig = field(
+        default_factory=lambda: AzureAISearchUploadStagerConfig()
     )
 
     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
        """
        updates the dictionary that is from each Element being converted into a dict/json
        into a dictionary that conforms to the schema expected by the
@@ -84,6 +101,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
        """
 
        data["id"] = str(uuid.uuid4())
+        data[RECORD_ID_LABEL] = file_data.identifier
 
        if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
            data["metadata"]["coordinates"]["points"] = json.dumps(points)
@@ -124,6 +142,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -132,23 +151,59 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
 
-        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
 
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
 @dataclass
-class AzureCognitiveSearchUploader(Uploader):
-    upload_config: AzureCognitiveSearchUploaderConfig
-    connection_config: AzureCognitiveSearchConnectionConfig
+class AzureAISearchUploader(Uploader):
+    upload_config: AzureAISearchUploaderConfig
+    connection_config: AzureAISearchConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def query_docs(self, record_id: str, index_key: str) -> list[str]:
+        client = self.connection_config.get_search_client()
+        results = list(client.search(filter=f"record_id eq '{record_id}'", select=[index_key]))
+        return [result[index_key] for result in results]
+
+    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from azure cognitive search index: {self.connection_config.index}"
+        )
+        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
+        if not doc_ids_to_delete:
+            return
+        client: SearchClient = self.connection_config.get_search_client()
+        results = client.delete_documents(
+            documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+        )
+        errors = []
+        success = []
+        for result in results:
+            if result.succeeded:
+                success.append(result)
+            else:
+                errors.append(result)
+        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
+        if errors:
+            raise WriteError(
+                ", ".join(
+                    [f"[{error.status_code}] {error.error_message}" for error in errors],
+                ),
+            )
+
     @DestinationConnectionError.wrap
-    @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
+    @requires_dependencies(["azure"], extras="azure-ai-search")
+    def write_dict(self, elements_dict: list[dict[str, Any]]) -> None:
         import azure.core.exceptions
 
         logger.info(
@@ -156,7 +211,7 @@ class AzureCognitiveSearchUploader(Uploader):
             f"index at {self.connection_config.index}",
         )
         try:
-            results = self.connection_config.generate_client().upload_documents(
+            results = self.connection_config.get_search_client().upload_documents(
                 documents=elements_dict
             )
 
@@ -174,24 +229,42 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.azure_cognitive_search_key}: "
+                        f"{error.azure_ai_search_key}: "
                         f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
             )
 
+    def can_delete(self) -> bool:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        record_id_fields = [
+            field for field in index_fields if field.name == self.upload_config.record_id_key
+        ]
+        if not record_id_fields:
+            return False
+        record_id_field = record_id_fields[0]
+        return record_id_field.filterable
+
+    def get_index_key(self) -> str:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        key_fields = [field for field in index_fields if field.key]
+        if not key_fields:
+            raise ValueError("no key field found in index fields")
+        return key_fields[0].name
+
     def precheck(self) -> None:
         try:
-            client = self.connection_config.generate_client()
+            client = self.connection_config.get_search_client()
             client.get_document_count()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def write_dict_wrapper(self, elements_dict):
-        return self.write_dict(elements_dict=elements_dict)
-
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
@@ -201,17 +274,21 @@ class AzureCognitiveSearchUploader(Uploader):
             f" index at {str(self.connection_config.index)}"
             f" with batch size {str(self.upload_config.batch_size)}"
         )
+        if self.can_delete():
+            index_key = self.get_index_key()
+            self.delete_by_record_id(file_data=file_data, index_key=index_key)
+        else:
+            logger.warning("criteria for deleting previous content not met, skipping")
 
         batch_size = self.upload_config.batch_size
-
         for chunk in batch_generator(elements_dict, batch_size):
             self.write_dict(elements_dict=chunk)  # noqa: E203
 
 
-azure_cognitive_search_destination_entry = DestinationRegistryEntry(
-    connection_config=AzureCognitiveSearchConnectionConfig,
-    uploader=AzureCognitiveSearchUploader,
-    uploader_config=AzureCognitiveSearchUploaderConfig,
-    upload_stager=AzureCognitiveSearchUploadStager,
-    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
+azure_ai_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureAISearchConnectionConfig,
+    uploader=AzureAISearchUploader,
+    uploader_config=AzureAISearchUploaderConfig,
+    upload_stager=AzureAISearchUploadStager,
+    upload_stager_config=AzureAISearchUploadStagerConfig,
 )
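
A hedged sketch of how the renamed Azure AI Search uploader decides whether it can replace previously uploaded documents, mirroring can_delete() above. The endpoint, index name, and key are placeholders; the class and field names come from the diff, and the assumption is that the pydantic configs accept the keyword arguments shown:

from unstructured_ingest.v2.constants import RECORD_ID_LABEL
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
)

connection_config = AzureAISearchConnectionConfig(
    endpoint="https://my-service.search.windows.net",  # placeholder service URL
    index="my-index",  # placeholder index name
    access_config=AzureAISearchAccessConfig(key="<admin-key>"),  # placeholder credential
)

# can_delete() requires a filterable field named after record_id_key (default RECORD_ID_LABEL)
index = connection_config.get_search_index_client().get_index(name=connection_config.index)
record_id_fields = [f for f in index.fields if f.name == RECORD_ID_LABEL]
if record_id_fields and record_id_fields[0].filterable:
    print("re-ingesting a file will delete its previously uploaded documents first")
else:
    print("no filterable record_id field; previous documents are left in place")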
--- /dev/null
+++ b/unstructured_ingest/v2/processes/connectors/confluence.py
@@ -0,0 +1,195 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Generator, List, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from atlassian import Confluence
+
+CONNECTOR_TYPE = "confluence"
+
+
+class ConfluenceAccessConfig(AccessConfig):
+    api_token: str = Field(description="Confluence API token")
+
+
+class ConfluenceConnectionConfig(ConnectionConfig):
+    url: str = Field(description="URL of the Confluence instance")
+    user_email: str = Field(description="User email for authentication")
+    access_config: Secret[ConfluenceAccessConfig] = Field(
+        description="Access configuration for Confluence"
+    )
+
+    @requires_dependencies(["atlassian"], extras="confluence")
+    def get_client(self) -> "Confluence":
+        from atlassian import Confluence
+
+        access_configs = self.access_config.get_secret_value()
+        return Confluence(
+            url=self.url,
+            username=self.user_email,
+            password=access_configs.api_token,
+        )
+
+
+class ConfluenceIndexerConfig(IndexerConfig):
+    max_num_of_spaces: int = Field(500, description="Maximum number of spaces to index")
+    max_num_of_docs_from_each_space: int = Field(
+        100, description="Maximum number of documents to fetch from each space"
+    )
+    spaces: Optional[List[str]] = Field(None, description="List of specific space keys to index")
+
+
+@dataclass
+class ConfluenceIndexer(Indexer):
+    connection_config: ConfluenceConnectionConfig
+    index_config: ConfluenceIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> bool:
+        try:
+
+            # Attempt to retrieve a list of spaces with limit=1.
+            # This should only succeed if all creds are valid
+            client = self.connection_config.get_client()
+            client.get_all_spaces(limit=1)
+            logger.info("Connection to Confluence successful.")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
+
+    def _get_space_ids(self) -> List[str]:
+        spaces = self.index_config.spaces
+        if spaces:
+            return spaces
+        else:
+            client = self.connection_config.get_client()
+            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
+            space_ids = [space["key"] for space in all_spaces["results"]]
+            return space_ids
+
+    def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
+        client = self.connection_config.get_client()
+        pages = client.get_all_pages_from_space(
+            space=space_id,
+            start=0,
+            limit=self.index_config.max_num_of_docs_from_each_space,
+            expand=None,
+            content_type="page",
+            status=None,
+        )
+        doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
+        return doc_ids
+
+    def run(self) -> Generator[FileData, None, None]:
+        from time import time
+
+        space_ids = self._get_space_ids()
+        for space_id in space_ids:
+            doc_ids = self._get_docs_ids_within_one_space(space_id)
+            for doc in doc_ids:
+                doc_id = doc["doc_id"]
+                # Build metadata
+                metadata = FileDataSourceMetadata(
+                    date_processed=str(time()),
+                    url=f"{self.connection_config.url}/pages/{doc_id}",
+                    record_locator={
+                        "space_id": space_id,
+                        "document_id": doc_id,
+                    },
+                )
+                additional_metadata = {
+                    "space_id": space_id,
+                    "document_id": doc_id,
+                }
+
+                # Construct relative path and filename
+                filename = f"{doc_id}.html"
+                relative_path = str(Path(space_id) / filename)
+
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    fullpath=relative_path,
+                    rel_path=relative_path,
+                )
+
+                file_data = FileData(
+                    identifier=doc_id,
+                    connector_type=self.connector_type,
+                    metadata=metadata,
+                    additional_metadata=additional_metadata,
+                    source_identifiers=source_identifiers,
+                )
+                yield file_data
+
+
+class ConfluenceDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class ConfluenceDownloader(Downloader):
+    connection_config: ConfluenceConnectionConfig
+    download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
+        doc_id = file_data.identifier
+        try:
+            client = self.connection_config.get_client()
+            page = client.get_page_by_id(
+                page_id=doc_id,
+                expand="history.lastUpdated,version,body.view",
+            )
+        except Exception as e:
+            logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
+
+        if not page:
+            raise ValueError(f"Page with ID {doc_id} does not exist.")
+
+        content = page["body"]["view"]["value"]
+
+        filepath = file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(content)
+
+        # Update file_data with metadata
+        file_data.metadata.date_created = page["history"]["createdDate"]
+        file_data.metadata.date_modified = page["version"]["when"]
+        file_data.metadata.version = str(page["version"]["number"])
+        file_data.display_name = page["title"]
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+confluence_source_entry = SourceRegistryEntry(
+    connection_config=ConfluenceConnectionConfig,
+    indexer_config=ConfluenceIndexerConfig,
+    indexer=ConfluenceIndexer,
+    downloader_config=ConfluenceDownloaderConfig,
+    downloader=ConfluenceDownloader,
+)
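
A hedged sketch of driving the new Confluence source outside a full pipeline. The URL, email, token, and space key are placeholders; only class and field names defined in the new module are used, and the assumption is that ConfluenceIndexer can be constructed directly with the fields shown in its dataclass definition:

from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
    ConfluenceIndexer,
    ConfluenceIndexerConfig,
)

connection_config = ConfluenceConnectionConfig(
    url="https://example.atlassian.net/wiki",  # placeholder Confluence instance
    user_email="user@example.com",  # placeholder account
    access_config=ConfluenceAccessConfig(api_token="<api-token>"),  # placeholder token
)
indexer = ConfluenceIndexer(
    connection_config=connection_config,
    index_config=ConfluenceIndexerConfig(
        spaces=["ENG"],  # placeholder space key
        max_num_of_docs_from_each_space=10,
    ),
)

indexer.precheck()  # raises SourceConnectionError if the credentials are rejected
for file_data in indexer.run():
    # each FileData describes one page; the downloader later stores it as <space>/<page_id>.html
    print(file_data.identifier, file_data.source_identifiers.fullpath)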