PyPI - unstructured-ingest - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (44) hide show

unstructured_ingest/v2/processes/connectors/astradb.py CHANGED Viewed

@@ -1,31 +1,50 @@
+import copy
+import csv
+import hashlib
 import json
+import sys
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from time import time
+from typing import TYPE_CHECKING, Any, Generator, Optional
 from pydantic import Field, Secret
 from unstructured_ingest import __name__ as integration_name
 from unstructured_ingest.__version__ import __version__ as integration_version
-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
     Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
+    SourceRegistryEntry,
 )
 if TYPE_CHECKING:
+    from astrapy import AsyncCollection as AstraDBAsyncCollection
     from astrapy import Collection as AstraDBCollection
+    from astrapy import DataAPIClient as AstraDBClient
 CONNECTOR_TYPE = "astradb"
@@ -37,14 +56,253 @@ class AstraDBAccessConfig(AccessConfig):
 class AstraDBConnectionConfig(ConnectionConfig):
-    connection_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
     access_config: Secret[AstraDBAccessConfig]
+    @requires_dependencies(["astrapy"], extras="astradb")
+    def get_client(self) -> "AstraDBClient":
+        from astrapy import DataAPIClient as AstraDBClient
+        # Create a client object to interact with the Astra DB
+        # caller_name/version for Astra DB tracking
+        return AstraDBClient(
+            caller_name=integration_name,
+            caller_version=integration_version,
+        )
+def get_astra_collection(
+    connection_config: AstraDBConnectionConfig,
+    collection_name: str,
+    keyspace: str,
+) -> "AstraDBCollection":
+    # Build the Astra DB object.
+    access_configs = connection_config.access_config.get_secret_value()
+    # Create a client object to interact with the Astra DB
+    # caller_name/version for Astra DB tracking
+    client = connection_config.get_client()
+    # Get the database object
+    astra_db = client.get_database(
+        api_endpoint=access_configs.api_endpoint,
+        token=access_configs.token,
+        keyspace=keyspace,
+    )
+    # Connect to the collection
+    astra_db_collection = astra_db.get_collection(name=collection_name)
+    return astra_db_collection
+async def get_async_astra_collection(
+    connection_config: AstraDBConnectionConfig,
+    collection_name: str,
+    keyspace: str,
+) -> "AstraDBAsyncCollection":
+    # Build the Astra DB object.
+    access_configs = connection_config.access_config.get_secret_value()
+    # Create a client object to interact with the Astra DB
+    client = connection_config.get_client()
+    # Get the async database object
+    async_astra_db = client.get_async_database(
+        api_endpoint=access_configs.api_endpoint,
+        token=access_configs.token,
+        keyspace=keyspace,
+    )
+    # Get async collection from AsyncDatabase
+    async_astra_db_collection = await async_astra_db.get_collection(name=collection_name)
+    return async_astra_db_collection
 class AstraDBUploadStagerConfig(UploadStagerConfig):
     pass
+class AstraDBIndexerConfig(IndexerConfig):
+    collection_name: str = Field(
+        description="The name of the Astra DB collection. "
+        "Note that the collection name must only include letters, "
+        "numbers, and underscores."
+    )
+    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The Astra DB connection namespace.",
+        deprecated="Please use 'keyspace' instead.",
+    )
+    batch_size: int = Field(default=20, description="Number of records per batch")
+class AstraDBDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+class AstraDBUploaderConfig(UploaderConfig):
+    collection_name: str = Field(
+        description="The name of the Astra DB collection. "
+        "Note that the collection name must only include letters, "
+        "numbers, and underscores."
+    )
+    embedding_dimension: int = Field(
+        default=384, description="The dimensionality of the embeddings"
+    )
+    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The Astra DB connection namespace.",
+        deprecated="Please use 'keyspace' instead.",
+    )
+    requested_indexing_policy: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="The indexing policy to use for the collection.",
+        examples=['{"deny": ["metadata"]}'],
+    )
+    batch_size: int = Field(default=20, description="Number of records per batch")
+@dataclass
+class AstraDBIndexer(Indexer):
+    connection_config: AstraDBConnectionConfig
+    index_config: AstraDBIndexerConfig
+    def get_collection(self) -> "AstraDBCollection":
+        return get_astra_collection(
+            connection_config=self.connection_config,
+            collection_name=self.index_config.collection_name,
+            keyspace=self.index_config.keyspace or self.index_config.namespace,
+        )
+    def precheck(self) -> None:
+        try:
+            self.get_collection()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+    def _get_doc_ids(self) -> set[str]:
+        """Fetches all document ids in an index"""
+        # Initialize set of ids
+        ids = set()
+        # Get the collection
+        collection = self.get_collection()
+        # Perform the find operation to get all items
+        astra_db_docs_cursor = collection.find({}, projection={"_id": True})
+        # Iterate over the cursor
+        astra_db_docs = []
+        for result in astra_db_docs_cursor:
+            astra_db_docs.append(result)
+        # Create file data for each astra record
+        for astra_record in astra_db_docs:
+            ids.add(astra_record["_id"])
+        return ids
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        all_ids = self._get_doc_ids()
+        ids = list(all_ids)
+        id_batches = batch_generator(ids, self.index_config.batch_size)
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(batch) + sys.maxsize + 1)
+            fd = FileData(
+                identifier=identified,
+                connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
+                metadata=FileDataSourceMetadata(
+                    date_processed=str(time()),
+                ),
+                additional_metadata={
+                    "ids": list(batch),
+                    "collection_name": self.index_config.collection_name,
+                    "keyspace": self.index_config.keyspace or self.index_config.namespace,
+                },
+            )
+            yield fd
+@dataclass
+class AstraDBDownloader(Downloader):
+    connection_config: AstraDBConnectionConfig
+    download_config: AstraDBDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+    def is_async(self) -> bool:
+        return True
+    def get_identifier(self, record_id: str) -> str:
+        f = f"{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+    def write_astra_result_to_csv(self, astra_result: dict, download_path: str) -> None:
+        with open(download_path, "w", encoding="utf8") as f:
+            writer = csv.writer(f)
+            writer.writerow(astra_result.keys())
+            writer.writerow(astra_result.values())
+    def generate_download_response(self, result: dict, file_data: FileData) -> DownloadResponse:
+        record_id = result["_id"]
+        filename_id = self.get_identifier(record_id=record_id)
+        filename = f"{filename_id}.csv"  # csv to preserve column info
+        download_path = self.download_dir / Path(filename)
+        logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            self.write_astra_result_to_csv(astra_result=result, download_path=download_path)
+        except Exception as e:
+            logger.error(
+                f"failed to download from record {record_id} to {download_path}: {e}",
+                exc_info=True,
+            )
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+        # modify input file_data for download_response
+        copied_file_data = copy.deepcopy(file_data)
+        copied_file_data.identifier = filename
+        copied_file_data.doc_type = "file"
+        copied_file_data.metadata.date_processed = str(time())
+        copied_file_data.metadata.record_locator = {"document_id": record_id}
+        copied_file_data.additional_metadata.pop("ids", None)
+        return super().generate_download_response(
+            file_data=copied_file_data, download_path=download_path
+        )
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        raise NotImplementedError("Use astradb run_async instead")
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        # Get metadata from file_data
+        ids: list[str] = file_data.additional_metadata["ids"]
+        collection_name: str = file_data.additional_metadata["collection_name"]
+        keyspace: str = file_data.additional_metadata["keyspace"]
+        # Retrieve results from async collection
+        download_responses = []
+        async_astra_collection = await get_async_astra_collection(
+            connection_config=self.connection_config,
+            collection_name=collection_name,
+            keyspace=keyspace,
+        )
+        async for result in async_astra_collection.find({"_id": {"$in": ids}}):
+            download_responses.append(
+                self.generate_download_response(result=result, file_data=file_data)
+            )
+        return download_responses
 @dataclass
 class AstraDBUploadStager(UploadStager):
     upload_stager_config: AstraDBUploadStagerConfig = field(
@@ -77,29 +335,6 @@ class AstraDBUploadStager(UploadStager):
         return output_path
-class AstraDBUploaderConfig(UploaderConfig):
-    collection_name: str = Field(
-        description="The name of the Astra DB collection. "
-        "Note that the collection name must only include letters, "
-        "numbers, and underscores."
-    )
-    embedding_dimension: int = Field(
-        default=384, description="The dimensionality of the embeddings"
-    )
-    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
-    requested_indexing_policy: Optional[dict[str, Any]] = Field(
-        default=None,
-        description="The indexing policy to use for the collection.",
-        examples=['{"deny": ["metadata"]}'],
-    )
-    batch_size: int = Field(default=20, description="Number of records per batch")
 @dataclass
 class AstraDBUploader(Uploader):
     connection_config: AstraDBConnectionConfig
@@ -108,43 +343,23 @@ class AstraDBUploader(Uploader):
     def precheck(self) -> None:
         try:
-            self.get_collection()
+            get_astra_collection(
+                connection_config=self.connection_config,
+                collection_name=self.upload_config.collection_name,
+                keyspace=self.upload_config.keyspace or self.upload_config.namespace,
+            )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
-        from astrapy import DataAPIClient as AstraDBClient
-        # Choose keyspace or deprecated namespace
-        keyspace_param = self.upload_config.keyspace or self.upload_config.namespace
-        # Get the collection_name
-        collection_name = self.upload_config.collection_name
-        # Build the Astra DB object.
-        access_configs = self.connection_config.access_config.get_secret_value()
-        # Create a client object to interact with the Astra DB
-        # caller_name/version for Astra DB tracking
-        my_client = AstraDBClient(
-            caller_name=integration_name,
-            caller_version=integration_version,
-        )
-        # Get the database object
-        astra_db = my_client.get_database(
-            api_endpoint=access_configs.api_endpoint,
-            token=access_configs.token,
-            keyspace=keyspace_param,
+        return get_astra_collection(
+            connection_config=self.connection_config,
+            collection_name=self.upload_config.collection_name,
+            keyspace=self.upload_config.keyspace or self.upload_config.namespace,
         )
-        # Connect to the newly created collection
-        astra_db_collection = astra_db.get_collection(name=collection_name)
-        return astra_db_collection
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
@@ -160,6 +375,14 @@ class AstraDBUploader(Uploader):
             collection.insert_many(chunk)
+astra_db_source_entry = SourceRegistryEntry(
+    indexer=AstraDBIndexer,
+    indexer_config=AstraDBIndexerConfig,
+    downloader=AstraDBDownloader,
+    downloader_config=AstraDBDownloaderConfig,
+    connection_config=AstraDBConnectionConfig,
+)
 astra_db_destination_entry = DestinationRegistryEntry(
     connection_config=AstraDBConnectionConfig,
     upload_stager_config=AstraDBUploadStagerConfig,

unstructured_ingest/v2/processes/connectors/confluence.py ADDED Viewed

@@ -0,0 +1,195 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Generator, List, Optional
+from pydantic import Field, Secret
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+if TYPE_CHECKING:
+    from atlassian import Confluence
+CONNECTOR_TYPE = "confluence"
+class ConfluenceAccessConfig(AccessConfig):
+    api_token: str = Field(description="Confluence API token")
+class ConfluenceConnectionConfig(ConnectionConfig):
+    url: str = Field(description="URL of the Confluence instance")
+    user_email: str = Field(description="User email for authentication")
+    access_config: Secret[ConfluenceAccessConfig] = Field(
+        description="Access configuration for Confluence"
+    )
+    @requires_dependencies(["atlassian"], extras="confluence")
+    def get_client(self) -> "Confluence":
+        from atlassian import Confluence
+        access_configs = self.access_config.get_secret_value()
+        return Confluence(
+            url=self.url,
+            username=self.user_email,
+            password=access_configs.api_token,
+        )
+class ConfluenceIndexerConfig(IndexerConfig):
+    max_num_of_spaces: int = Field(500, description="Maximum number of spaces to index")
+    max_num_of_docs_from_each_space: int = Field(
+        100, description="Maximum number of documents to fetch from each space"
+    )
+    spaces: Optional[List[str]] = Field(None, description="List of specific space keys to index")
+@dataclass
+class ConfluenceIndexer(Indexer):
+    connection_config: ConfluenceConnectionConfig
+    index_config: ConfluenceIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+    def precheck(self) -> bool:
+        try:
+            # Attempt to retrieve a list of spaces with limit=1.
+            # This should only succeed if all creds are valid
+            client = self.connection_config.get_client()
+            client.get_all_spaces(limit=1)
+            logger.info("Connection to Confluence successful.")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
+    def _get_space_ids(self) -> List[str]:
+        spaces = self.index_config.spaces
+        if spaces:
+            return spaces
+        else:
+            client = self.connection_config.get_client()
+            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
+            space_ids = [space["key"] for space in all_spaces["results"]]
+            return space_ids
+    def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
+        client = self.connection_config.get_client()
+        pages = client.get_all_pages_from_space(
+            space=space_id,
+            start=0,
+            limit=self.index_config.max_num_of_docs_from_each_space,
+            expand=None,
+            content_type="page",
+            status=None,
+        )
+        doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
+        return doc_ids
+    def run(self) -> Generator[FileData, None, None]:
+        from time import time
+        space_ids = self._get_space_ids()
+        for space_id in space_ids:
+            doc_ids = self._get_docs_ids_within_one_space(space_id)
+            for doc in doc_ids:
+                doc_id = doc["doc_id"]
+                # Build metadata
+                metadata = FileDataSourceMetadata(
+                    date_processed=str(time()),
+                    url=f"{self.connection_config.url}/pages/{doc_id}",
+                    record_locator={
+                        "space_id": space_id,
+                        "document_id": doc_id,
+                    },
+                )
+                additional_metadata = {
+                    "space_id": space_id,
+                    "document_id": doc_id,
+                }
+                # Construct relative path and filename
+                filename = f"{doc_id}.html"
+                relative_path = str(Path(space_id) / filename)
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    fullpath=relative_path,
+                    rel_path=relative_path,
+                )
+                file_data = FileData(
+                    identifier=doc_id,
+                    connector_type=self.connector_type,
+                    metadata=metadata,
+                    additional_metadata=additional_metadata,
+                    source_identifiers=source_identifiers,
+                )
+                yield file_data
+class ConfluenceDownloaderConfig(DownloaderConfig):
+    pass
+@dataclass
+class ConfluenceDownloader(Downloader):
+    connection_config: ConfluenceConnectionConfig
+    download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        doc_id = file_data.identifier
+        try:
+            client = self.connection_config.get_client()
+            page = client.get_page_by_id(
+                page_id=doc_id,
+                expand="history.lastUpdated,version,body.view",
+            )
+        except Exception as e:
+            logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
+        if not page:
+            raise ValueError(f"Page with ID {doc_id} does not exist.")
+        content = page["body"]["view"]["value"]
+        filepath = file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(content)
+        # Update file_data with metadata
+        file_data.metadata.date_created = page["history"]["createdDate"]
+        file_data.metadata.date_modified = page["version"]["when"]
+        file_data.metadata.version = str(page["version"]["number"])
+        file_data.display_name = page["title"]
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+confluence_source_entry = SourceRegistryEntry(
+    connection_config=ConfluenceConnectionConfig,
+    indexer_config=ConfluenceIndexerConfig,
+    indexer=ConfluenceIndexer,
+    downloader_config=ConfluenceDownloaderConfig,
+    downloader=ConfluenceDownloader,
+)

unstructured_ingest/v2/processes/connectors/databricks/volumes.py CHANGED Viewed

@@ -148,9 +148,7 @@ class DatabricksVolumesDownloader(Downloader, ABC):
 class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass
 @dataclass
@@ -166,10 +164,12 @@ class DatabricksVolumesUploader(Uploader, ABC):
             raise DestinationConnectionError(f"failed to validate connection: {e}")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path = os.path.join(self.upload_config.path, file_data.source_identifiers.filename)
+        output_path = os.path.join(
+            self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
+        )
         with open(path, "rb") as elements_file:
             self.connection_config.get_client().files.upload(
                 file_path=output_path,
                 contents=elements_file,
-                overwrite=self.upload_config.overwrite,
+                overwrite=True,
             )

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py CHANGED Viewed

@@ -176,6 +176,7 @@ class FsspecIndexer(Indexer):
                 ),
                 metadata=self.get_metadata(file_data=file_data),
                 additional_metadata=additional_metadata,
+                display_name=file_path,
             )
@@ -230,9 +231,7 @@ class FsspecDownloader(Downloader):
 class FsspecUploaderConfig(FileConfig, UploaderConfig):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass
 FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
@@ -287,9 +286,6 @@ class FsspecUploader(Uploader):
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-        if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
@@ -297,9 +293,5 @@ class FsspecUploader(Uploader):
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
-        already_exists = self.fs.exists(path=str(upload_path))
-        if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl