ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/document_search/__init__.py +3 -0
- ragbits/document_search/_main.py +273 -0
- ragbits/document_search/cli.py +109 -0
- ragbits/document_search/documents/__init__.py +0 -0
- ragbits/document_search/documents/document.py +203 -0
- ragbits/document_search/documents/element.py +208 -0
- ragbits/document_search/ingestion/__init__.py +0 -0
- ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
- ragbits/document_search/ingestion/enrichers/base.py +64 -0
- ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
- ragbits/document_search/ingestion/enrichers/image.py +107 -0
- ragbits/document_search/ingestion/enrichers/router.py +86 -0
- ragbits/document_search/ingestion/parsers/__init__.py +9 -0
- ragbits/document_search/ingestion/parsers/base.py +97 -0
- ragbits/document_search/ingestion/parsers/docling.py +178 -0
- ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
- ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
- ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
- ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
- ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
- ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
- ragbits/document_search/ingestion/parsers/router.py +90 -0
- ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
- ragbits/document_search/ingestion/strategies/__init__.py +6 -0
- ragbits/document_search/ingestion/strategies/base.py +290 -0
- ragbits/document_search/ingestion/strategies/batched.py +261 -0
- ragbits/document_search/ingestion/strategies/ray.py +138 -0
- ragbits/document_search/ingestion/strategies/sequential.py +23 -0
- ragbits/document_search/py.typed +0 -0
- ragbits/document_search/retrieval/__init__.py +0 -0
- ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
- ragbits/document_search/retrieval/rephrasers/base.py +39 -0
- ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
- ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
- ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
- ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
- ragbits/document_search/retrieval/rerankers/base.py +56 -0
- ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
- ragbits/document_search/retrieval/rerankers/llm.py +177 -0
- ragbits/document_search/retrieval/rerankers/noop.py +34 -0
- ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
ragbits/document_search/ingestion/parsers/router.py
@@ -0,0 +1,90 @@
+from collections.abc import Mapping
+from typing import ClassVar
+
+from typing_extensions import Self
+
+from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
+from ragbits.document_search.documents.document import DocumentType
+from ragbits.document_search.ingestion.parsers.base import DocumentParser
+from ragbits.document_search.ingestion.parsers.exceptions import ParserNotFoundError
+
+
+class DocumentParserRouter(WithConstructionConfig):
+    """
+    The class responsible for routing the document to the correct parser based on the document type.
+    """
+
+    configuration_key: ClassVar[str] = "parser_router"
+
+    _parsers: Mapping[DocumentType, DocumentParser]
+
+    def __init__(self, parsers: Mapping[DocumentType, DocumentParser] | None = None) -> None:
+        """
+        Initialize the DocumentParserRouter instance.
+
+        Args:
+            parsers: The mapping of document types to their parsers, used to override the default parsers.
+        """
+        self._parsers = {**self._get_default_parsers(), **parsers} if parsers else self._get_default_parsers()
+
+    @classmethod
+    def from_config(cls, config: dict[str, ObjectConstructionConfig]) -> Self:
+        """
+        Initialize the class with the provided configuration.
+
+        Args:
+            config: A dictionary containing configuration details for the class.
+
+        Returns:
+            The DocumentParserRouter.
+
+        Raises:
+            InvalidConfigError: If any of the provided parsers cannot be initialized.
+        """
+        parsers = {
+            DocumentType(document_type): DocumentParser.subclass_from_config(parser_config)
+            for document_type, parser_config in config.items()
+        }
+        return super().from_config({"parsers": parsers})
+
+    def get(self, document_type: DocumentType) -> DocumentParser:
+        """
+        Get the parser for the document.
+
+        Args:
+            document_type: The document type.
+
+        Returns:
+            The parser for processing the document.
+
+        Raises:
+            ParserNotFoundError: If no parser is found for the document type.
+        """
+        parser = self._parsers.get(document_type)
+
+        if isinstance(parser, DocumentParser):
+            return parser
+
+        raise ParserNotFoundError(document_type)
+
+    @staticmethod
+    def _get_default_parsers() -> dict[DocumentType, DocumentParser]:
+        """
+        Get the default parsers.
+        """
+        from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser
+        from ragbits.document_search.ingestion.parsers.pptx.parser import PptxDocumentParser
+
+        _default_parser = DoclingDocumentParser()
+
+        return {
+            DocumentType.TXT: _default_parser,
+            DocumentType.MD: _default_parser,
+            DocumentType.PDF: _default_parser,
+            DocumentType.DOCX: _default_parser,
+            DocumentType.PPTX: PptxDocumentParser(),
+            DocumentType.XLSX: _default_parser,
+            DocumentType.HTML: _default_parser,
+            DocumentType.JPG: _default_parser,
+            DocumentType.PNG: _default_parser,
+        }
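
The router falls back to a shared Docling-based default for any document type not overridden (PPTX excepted). A minimal usage sketch, assuming the package is installed; the UnstructuredDocumentParser used as the override is the one shown in the next file:

    from ragbits.document_search.documents.document import DocumentType
    from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
    from ragbits.document_search.ingestion.parsers.unstructured import UnstructuredDocumentParser

    # Override the default PDF parser; every other type keeps its default.
    router = DocumentParserRouter({DocumentType.PDF: UnstructuredDocumentParser()})

    assert isinstance(router.get(DocumentType.PDF), UnstructuredDocumentParser)
    router.get(DocumentType.TXT)  # returns the shared DoclingDocumentParser default
    # get() on a type with no registered parser raises ParserNotFoundError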
ragbits/document_search/ingestion/parsers/unstructured.py
@@ -0,0 +1,248 @@
+import base64
+import inspect
+import os
+from io import BytesIO
+
+from PIL import Image
+from typing_extensions import Self
+
+try:
+    from unstructured import utils
+finally:
+    # Unstructured makes a very slow call to scarf analytics, including checking nvidia-smi,
+    # which adds a couple of seconds to the import time.
+    # This is a hack to disable it.
+    utils.scarf_analytics = lambda *args: True
+
+from unstructured.chunking.basic import chunk_elements
+from unstructured.documents.elements import Element as UnstructuredElement
+from unstructured.documents.elements import ElementType
+from unstructured.partition.auto import partition
+from unstructured.staging.base import elements_from_dicts
+from unstructured_client import UnstructuredClient
+from unstructured_client.models.operations import PartitionRequestTypedDict
+from unstructured_client.models.shared import FilesTypedDict, PartitionParametersTypedDict, Strategy
+
+from ragbits.core.audit.traces import traceable
+from ragbits.document_search.documents.document import Document, DocumentType
+from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
+from ragbits.document_search.ingestion.parsers.base import DocumentParser
+
+UNSTRUCTURED_API_KEY_ENV = "UNSTRUCTURED_API_KEY"
+UNSTRUCTURED_SERVER_URL_ENV = "UNSTRUCTURED_SERVER_URL"
+
+
+class UnstructuredDocumentParser(DocumentParser):
+    """
+    Parser that uses the Unstructured API or local SDK to process the documents.
+    """
+
+    supported_document_types = {
+        DocumentType.TXT,
+        DocumentType.MD,
+        DocumentType.PDF,
+        DocumentType.DOCX,
+        DocumentType.DOC,
+        DocumentType.PPTX,
+        DocumentType.PPT,
+        DocumentType.XLSX,
+        DocumentType.XLS,
+        DocumentType.CSV,
+        DocumentType.HTML,
+        DocumentType.EPUB,
+        DocumentType.ORG,
+        DocumentType.ODT,
+        DocumentType.RST,
+        DocumentType.RTF,
+        DocumentType.TSV,
+        DocumentType.JSON,
+        DocumentType.XML,
+        DocumentType.JPG,
+        DocumentType.PNG,
+    }
+
+    def __init__(
+        self,
+        partition_kwargs: dict | None = None,
+        chunking_kwargs: dict | None = None,
+        api_key: str | None = None,
+        api_server: str | None = None,
+        use_api: bool = False,
+        ignore_images: bool = False,
+    ) -> None:
+        """
+        Initialize the UnstructuredDocumentParser instance.
+
+        Args:
+            partition_kwargs: The additional arguments for the partitioning. Refer to the Unstructured API documentation
+                for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters
+            chunking_kwargs: The additional arguments for the chunking.
+            api_key: The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY environment
+                variable will be used.
+            api_server: The API server URL to use for the Unstructured API. If not specified, the
+                UNSTRUCTURED_SERVER_URL environment variable will be used.
+            use_api: Whether to use the Unstructured API; otherwise, the local version of the Unstructured library is used.
+            ignore_images: If True, images will be skipped.
+        """
+        self.partition_kwargs = partition_kwargs or {}
+        self.chunking_kwargs = chunking_kwargs or {}
+        self.api_key = api_key or os.getenv(UNSTRUCTURED_API_KEY_ENV)
+        self.api_server = api_server or os.getenv(UNSTRUCTURED_SERVER_URL_ENV)
+        self.use_api = use_api
+        self.ignore_images = ignore_images
+        self._client = UnstructuredClient(api_key_auth=self.api_key, server_url=self.api_server)
+
+    def __reduce__(self) -> tuple[type[Self], tuple]:
+        """
+        Enables the UnstructuredDocumentParser to be pickled and unpickled.
+
+        Returns:
+            The tuple of the class and its arguments that allows object reconstruction.
+        """
+        return self.__class__, tuple(
+            self.__getattribute__(param_name)
+            for param_name in list(inspect.signature(self.__class__.__init__).parameters)[1:]
+        )
+
+    @traceable
+    async def parse(self, document: Document) -> list[Element]:
+        """
+        Parse the document using the Unstructured API or local library.
+
+        Args:
+            document: The document to parse.
+
+        Returns:
+            The list of elements extracted from the document.
+
+        Raises:
+            ParserDocumentNotSupportedError: If the document type is not supported by the parser.
+        """
+        self.validate_document_type(document.metadata.document_type)
+        elements = await self._partition(document)
+        return self._chunk(elements, document)
+
+    async def _partition(self, document: Document) -> list[UnstructuredElement]:
+        """
+        Partition the document.
+
+        Args:
+            document: The document to parse.
+
+        Returns:
+            The list of extracted elements.
+        """
+        if self.use_api:
+            request = PartitionRequestTypedDict(
+                partition_parameters=PartitionParametersTypedDict(
+                    files=FilesTypedDict(
+                        content=document.local_path.read_bytes(),
+                        file_name=document.local_path.name,
+                    ),
+                    coordinates=True,
+                    strategy=Strategy.HI_RES,
+                    languages=["eng"],
+                    extract_image_block_types=["Image", "Table"],
+                    split_pdf_allow_failed=True,
+                    split_pdf_concurrency_level=15,
+                    split_pdf_page=True,
+                    include_orig_elements=True,
+                ),
+            )
+            request["partition_parameters"].update(**self.partition_kwargs)  # type: ignore
+            response = await self._client.general.partition_async(request=request)
+            return elements_from_dicts(response.elements) if response.elements else []
+
+        return partition(
+            filename=str(document.local_path),
+            metadata_filename=document.local_path.name,
+            extract_image_block_types=["Image", "Table"],
+            extract_image_block_to_payload=True,
+            include_orig_elements=True,
+            **self.partition_kwargs,
+        )
+
+    def _chunk(self, elements: list[UnstructuredElement], document: Document) -> list[Element]:
+        """
+        Chunk the list of elements.
+
+        Args:
+            elements: The list of unstructured elements.
+            document: The document to parse.
+
+        Returns:
+            The list of chunked elements.
+        """
+        nonimage_elements = [element for element in elements if element.category != ElementType.IMAGE]
+
+        text_elements: list[Element] = [
+            TextElement(
+                document_meta=document.metadata,
+                location=self._extract_element_location(element),
+                content=element.text,
+            )
+            for element in chunk_elements(nonimage_elements, **self.chunking_kwargs)
+        ]
+
+        if self.ignore_images:
+            return text_elements
+
+        return text_elements + [
+            ImageElement(
+                document_meta=document.metadata,
+                location=self._extract_element_location(element),
+                image_bytes=self._extract_image_element_bytes(element, document),
+                ocr_extracted_text=element.text,
+            )
+            for element in elements
+            if element.category == ElementType.IMAGE
+        ]
+
+    @staticmethod
+    def _extract_element_location(element: UnstructuredElement) -> ElementLocation:
+        """
+        Convert an unstructured element to an element location.
+
+        Args:
+            element: The element from unstructured.
+
+        Returns:
+            The element location.
+        """
+        metadata = element.metadata.to_dict()
+        return ElementLocation(
+            page_number=metadata.get("page_number"),
+            coordinates=metadata.get("coordinates"),
+        )
+
+    @staticmethod
+    def _extract_image_element_bytes(element: UnstructuredElement, document: Document) -> bytes:
+        """
+        Extract image data using alternative methods when element.metadata.image_base64 is empty.
+
+        This handles cases where Unstructured doesn't properly extract image data,
+        requiring additional processing.
+
+        Args:
+            element: The Unstructured image element.
+            document: The document to parse.
+
+        Returns:
+            The raw image data.
+        """
+        if element.metadata.image_base64:
+            return base64.b64decode(element.metadata.image_base64)
+
+        if element.metadata.coordinates and element.metadata.coordinates.points:
+            buffered = BytesIO()
+            Image.open(document.local_path).convert("RGB").crop(
+                (
+                    min(element.metadata.coordinates.points[0][0], element.metadata.coordinates.points[1][0]),
+                    min(element.metadata.coordinates.points[0][1], element.metadata.coordinates.points[3][1]),
+                    max(element.metadata.coordinates.points[2][0], element.metadata.coordinates.points[3][0]),
+                    max(element.metadata.coordinates.points[1][1], element.metadata.coordinates.points[2][1]),
+                )
+            ).save(buffered, format="JPEG")
+            return buffered.getvalue()
+
+        return b""
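
A sketch of constructing the parser in both modes; only constructor arguments shown above are used, and `document` stands for any fetched ragbits Document (hypothetical input):

    from ragbits.document_search.ingestion.parsers.unstructured import UnstructuredDocumentParser

    # Local mode: partitioning runs in-process via unstructured.partition.auto.
    local_parser = UnstructuredDocumentParser(ignore_images=True)

    # API mode: requires UNSTRUCTURED_API_KEY / UNSTRUCTURED_SERVER_URL (or explicit
    # api_key/api_server); partition_kwargs are merged into the partition request.
    api_parser = UnstructuredDocumentParser(use_api=True, partition_kwargs={"languages": ["eng"]})

    async def run(document):  # `document` is a fetched ragbits Document
        return await local_parser.parse(document)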
ragbits/document_search/ingestion/strategies/__init__.py
@@ -0,0 +1,6 @@
+from ragbits.document_search.ingestion.strategies.base import IngestStrategy
+from ragbits.document_search.ingestion.strategies.batched import BatchedIngestStrategy
+from ragbits.document_search.ingestion.strategies.ray import RayDistributedIngestStrategy
+from ragbits.document_search.ingestion.strategies.sequential import SequentialIngestStrategy
+
+__all__ = ["BatchedIngestStrategy", "IngestStrategy", "RayDistributedIngestStrategy", "SequentialIngestStrategy"]
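
All exported strategies share the retry configuration defined on the IngestStrategy base class (shown in the next file). A small sketch, assuming the subclasses pass these arguments through to the base __init__:

    from ragbits.document_search.ingestion.strategies import SequentialIngestStrategy

    # Retry each failed ingest task up to 5 times, with exponential backoff capped at 30s.
    strategy = SequentialIngestStrategy(num_retries=5, backoff_multiplier=2, backoff_max=30)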
ragbits/document_search/ingestion/strategies/base.py
@@ -0,0 +1,290 @@
+import asyncio
+import logging
+import random
+import traceback
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from collections.abc import Awaitable, Callable, Iterable
+from dataclasses import dataclass, field
+from types import ModuleType
+from typing import ClassVar, ParamSpec, TypeVar
+
+from ragbits.core.sources.base import Source
+from ragbits.core.utils.config_handling import WithConstructionConfig
+from ragbits.core.vector_stores.base import VectorStore
+from ragbits.document_search.documents.document import Document, DocumentMeta
+from ragbits.document_search.documents.element import Element
+from ragbits.document_search.ingestion import strategies
+from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
+from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
+
+logger = logging.getLogger(__name__)
+
+_CallP = ParamSpec("_CallP")
+_CallReturnT = TypeVar("_CallReturnT")
+
+
+@dataclass
+class IngestError:
+    """
+    Represents an error that occurred during the document ingest execution.
+    """
+
+    type: type[Exception]
+    message: str
+    stacktrace: str
+
+    @classmethod
+    def from_exception(cls, exc: Exception) -> "IngestError":
+        """
+        Create an IngestError from an exception.
+
+        Args:
+            exc: The exception to create the IngestError from.
+
+        Returns:
+            The IngestError instance.
+        """
+        stacktrace = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
+        return cls(type=type(exc), message=str(exc), stacktrace=stacktrace)
+
+
+@dataclass
+class IngestDocumentResult:
+    """
+    Represents the result of the document ingest execution.
+    """
+
+    document_uri: str
+    num_elements: int = 0
+    error: IngestError | None = None
+
+
+@dataclass
+class IngestExecutionResult:
+    """
+    Represents the result of the documents ingest execution.
+    """
+
+    successful: list[IngestDocumentResult] = field(default_factory=list)
+    failed: list[IngestDocumentResult] = field(default_factory=list)
+
+
+class IngestExecutionError(Exception):
+    """
+    Represents an error that occurred during the documents ingest execution.
+    """
+
+    def __init__(self, results: list[IngestDocumentResult]) -> None:
+        self.results = results
+
+
+class IngestStrategy(WithConstructionConfig, ABC):
+    """
+    Base class for ingest strategies, responsible for orchestrating the tasks required to index the document.
+    """
+
+    default_module: ClassVar[ModuleType | None] = strategies
+    configuration_key: ClassVar[str] = "ingest_strategy"
+
+    def __init__(self, num_retries: int = 3, backoff_multiplier: int = 1, backoff_max: int = 60) -> None:
+        """
+        Initialize the IngestStrategy instance.
+
+        Args:
+            num_retries: The number of retries per document ingest task error.
+            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
+            backoff_max: The maximum allowed delay (in seconds) between retries.
+        """
+        self.num_retries = num_retries
+        self.backoff_multiplier = backoff_multiplier
+        self.backoff_max = backoff_max
+
+    @abstractmethod
+    async def __call__(
+        self,
+        documents: Iterable[DocumentMeta | Document | Source],
+        vector_store: VectorStore,
+        parser_router: DocumentParserRouter,
+        enricher_router: ElementEnricherRouter,
+    ) -> IngestExecutionResult:
+        """
+        Ingest documents.
+
+        Args:
+            documents: The documents to ingest.
+            vector_store: The vector store to store document chunks.
+            parser_router: The document parser router to use.
+            enricher_router: The intermediate element enricher router to use.
+
+        Returns:
+            The ingest execution result.
+        """
+
+    async def _call_with_error_handling(
+        self,
+        executable: Callable[_CallP, Awaitable[_CallReturnT]],
+        *executable_args: _CallP.args,
+        **executable_kwargs: _CallP.kwargs,
+    ) -> _CallReturnT:
+        """
+        Call the executable with standardized error handling.
+        If an error occurs, the executable is retried `num_retries` times using randomized exponential backoff.
+
+        Args:
+            executable: The callable function to execute.
+            executable_args: Positional arguments to pass to the executable.
+            executable_kwargs: Keyword arguments to pass to the executable.
+
+        Returns:
+            The result of the executable if successful.
+
+        Raises:
+            Exception: The last encountered exception after all retries are exhausted.
+        """
+        for i in range(max(0, self.num_retries) + 1):
+            try:
+                return await executable(*executable_args, **executable_kwargs)
+            except Exception as exc:
+                if i == self.num_retries:
+                    raise exc
+
+                delay = min(2**i * self.backoff_multiplier, self.backoff_max)
+                delay = random.uniform(0, delay) if delay < self.backoff_max else random.uniform(0, self.backoff_max)  # noqa S311
+                await asyncio.sleep(delay)
+
+        raise RuntimeError("Unreachable code reached")  # mypy quirk
+
+    @staticmethod
+    async def _parse_document(
+        document: DocumentMeta | Document | Source,
+        parser_router: DocumentParserRouter,
+    ) -> list[Element]:
+        """
+        Parse a single document and return the elements.
+
+        Args:
+            document: The document to parse.
+            parser_router: The document parser router to use.
+
+        Returns:
+            The list of elements.
+
+        Raises:
+            ParserError: If the parsing of the document failed.
+            ParserDocumentNotSupportedError: If the document type is not supported.
+            ParserNotFoundError: If no parser is found for the document type.
+            SourceError: If the download of the document failed.
+        """
+        document_meta = (
+            await DocumentMeta.from_source(document)
+            if isinstance(document, Source)
+            else document
+            if isinstance(document, DocumentMeta)
+            else document.metadata
+        )
+
+        parser = parser_router.get(document_meta.document_type)
+        parser.validate_document_type(document_meta.document_type)
+        document = await document_meta.fetch()
+
+        return await parser.parse(document)
+
+    @staticmethod
+    async def _enrich_elements(
+        elements: Iterable[Element],
+        enricher_router: ElementEnricherRouter,
+    ) -> list[Element]:
+        """
+        Enrich elements for a single document.
+
+        Args:
+            elements: The document elements to enrich.
+            enricher_router: The element enricher router to use.
+
+        Returns:
+            The list of enriched elements.
+
+        Raises:
+            EnricherError: If the enrichment of the elements failed.
+            EnricherElementNotSupportedError: If the element type is not supported.
+        """
+        grouped_elements = defaultdict(list)
+        for element in elements:
+            grouped_elements[type(element)].append(element)
+
+        # Separate elements that have enrichers from those that don't
+        elements_to_enrich = []
+        elements_without_enrichers = []
+
+        for element_type, elements_of_type in grouped_elements.items():
+            if element_type in enricher_router:
+                enricher = enricher_router.get(element_type)
+                enricher.validate_element_type(element_type)
+                elements_to_enrich.append((element_type, elements_of_type))
+            else:
+                # No enricher found for this element type, keep elements as-is
+                elements_without_enrichers.extend(elements_of_type)
+
+        # Enrich elements that have enrichers
+        if elements_to_enrich:
+            grouped_enriched_elements = await asyncio.gather(
+                *[
+                    enricher_router.get(element_type).enrich(elements_of_type)
+                    for element_type, elements_of_type in elements_to_enrich
+                ]
+            )
+            enriched_elements = [element for enriched_group in grouped_enriched_elements for element in enriched_group]
+        else:
+            enriched_elements = []
+
+        # Combine enriched elements with elements that don't need enrichment
+        return enriched_elements + elements_without_enrichers
+
+    @staticmethod
+    async def _remove_elements(document_ids: list[str], vector_store: VectorStore) -> None:
+        """
+        Remove document entries from the vector store.
+
+        Args:
+            document_ids: The list of document ids to remove from the vector store.
+            vector_store: The vector store to remove document elements from.
+        """
+        # TODO: Pass 'where' argument to the list method to filter results and optimize search
+        ids_to_delete = [
+            entry.id
+            for entry in await vector_store.list()
+            if entry.metadata.get("document_meta", {}).get("source", {}).get("id") in document_ids
+        ]
+        if ids_to_delete:
+            await vector_store.remove(ids_to_delete)
+
+    @staticmethod
+    async def _insert_elements(elements: Iterable[Element], vector_store: VectorStore) -> None:
+        """
+        Insert elements into the vector store.
+
+        Args:
+            elements: The list of elements to insert.
+            vector_store: The vector store to store document chunks.
+        """
+        entries = [element.to_vector_db_entry() for element in elements]
+
+        # Deduplicate entries by their unique ID to prevent duplicate key errors in the
+        # underlying vector store implementation (many vector stores require IDs to be
+        # unique and will raise an error if duplicates are provided).
+        unique_entries: dict = {}
+        for entry in entries:
+            # If the ID is already present, we skip the duplicate and log a warning.
+            # This behaviour ensures idempotency of the ingest operation while
+            # still indexing the first occurrence of every element.
+            if entry.id not in unique_entries:
+                unique_entries[entry.id] = entry
+            else:
+                logger.warning(
+                    f"Skipping duplicate entry: {entry.id} from document "
+                    f"{entry.metadata.get('document_meta', {}).get('source', {}).get('id')}"
+                )
+
+        if unique_entries:
+            await vector_store.store(list(unique_entries.values()))
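
A minimal sketch of a custom strategy built from the helpers above; how the shipped strategies derive document_uri from the source is not shown in this diff, so a placeholder is used:

    from collections.abc import Iterable

    from ragbits.core.sources.base import Source
    from ragbits.core.vector_stores.base import VectorStore
    from ragbits.document_search.documents.document import Document, DocumentMeta
    from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
    from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
    from ragbits.document_search.ingestion.strategies.base import (
        IngestDocumentResult,
        IngestError,
        IngestExecutionResult,
        IngestStrategy,
    )

    class NaiveIngestStrategy(IngestStrategy):
        """Ingests documents one at a time using the base-class helpers."""

        async def __call__(
            self,
            documents: Iterable[DocumentMeta | Document | Source],
            vector_store: VectorStore,
            parser_router: DocumentParserRouter,
            enricher_router: ElementEnricherRouter,
        ) -> IngestExecutionResult:
            result = IngestExecutionResult()
            for i, document in enumerate(documents):
                uri = f"document-{i}"  # placeholder: real strategies derive this from the source
                try:
                    # Each stage is retried with randomized exponential backoff.
                    elements = await self._call_with_error_handling(
                        self._parse_document, document, parser_router
                    )
                    enriched = await self._call_with_error_handling(
                        self._enrich_elements, elements, enricher_router
                    )
                    await self._call_with_error_handling(
                        self._insert_elements, enriched, vector_store
                    )
                    result.successful.append(
                        IngestDocumentResult(document_uri=uri, num_elements=len(enriched))
                    )
                except Exception as exc:  # collect failures rather than abort the batch
                    result.failed.append(
                        IngestDocumentResult(document_uri=uri, error=IngestError.from_exception(exc))
                    )
            return result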