PyPI - ragbits-document-search - Versions diffs - 1.4.0.dev202601310254__py3-none-any.whl - Mend

ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

ragbits/document_search/ingestion/parsers/docling.py ADDED Viewed

@@ -0,0 +1,178 @@
+from docling.chunking import HierarchicalChunker
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import AcceleratorOptions, EasyOcrOptions, PdfPipelineOptions, PipelineOptions
+from docling.document_converter import (
+    DocumentConverter,
+    ExcelFormatOption,
+    FormatOption,
+    HTMLFormatOption,
+    MarkdownFormatOption,
+    PdfFormatOption,
+    PowerpointFormatOption,
+    WordFormatOption,
+)
+from docling_core.transforms.chunker.base import BaseChunker
+from docling_core.types.doc import DocItem, DoclingDocument
+from ragbits.document_search.documents.document import Document, DocumentType
+from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
+from ragbits.document_search.ingestion.parsers import DocumentParser
+class DoclingDocumentParser(DocumentParser):
+    """
+    Parser that uses the Docling to process the documents.
+    """
+    supported_document_types = {
+        DocumentType.DOCX,
+        DocumentType.PPTX,
+        DocumentType.XLSX,
+        DocumentType.MD,
+        DocumentType.PNG,
+        DocumentType.JPG,
+        DocumentType.HTML,
+        DocumentType.TXT,
+        DocumentType.PDF,
+    }
+    def __init__(
+        self,
+        ignore_images: bool = False,
+        num_threads: int = 1,
+        chunker: BaseChunker | None = None,
+        format_options: dict[InputFormat, FormatOption] | None = None,
+    ) -> None:
+        """
+        Initialize the DoclingDocumentParser instance.
+        Args:
+            ignore_images: If True images will be skipped.
+            num_threads: The number of threads for parsing parallelism on CPU.
+            chunker: Custom chunker instance. If None, HierarchicalChunker will be used.
+            format_options: Full format options configuration for DocumentConverter.
+                If None, default format options will be used.
+        """
+        self.ignore_images = ignore_images
+        self.num_threads = num_threads
+        self.chunker = chunker
+        self.format_options = format_options
+    async def parse(self, document: Document) -> list[Element]:
+        """
+        Parse the document using the Docling API.
+        Args:
+            document: The document to parse.
+        Returns:
+            The list of elements extracted from the document.
+        """
+        self.validate_document_type(document.metadata.document_type)
+        partitioned_document = await self._partition(document)
+        return self._chunk(partitioned_document, document)
+    async def _partition(self, document: Document) -> DoclingDocument:
+        """
+        Partition the document.
+        Args:
+            document: The document to parse.
+        Returns:
+            The docling document.
+        Raises:
+            ConversionError: If converting the document to the Docling format fails.
+        """
+        # Use provided format_options or create default ones
+        if self.format_options is not None:
+            converter = DocumentConverter(format_options=self.format_options)
+        else:
+            # Build default format options
+            accelerator_options = AcceleratorOptions(num_threads=self.num_threads)
+            pipeline_options = PipelineOptions(accelerator_options=accelerator_options)
+            pdf_pipeline_options = PdfPipelineOptions(
+                images_scale=2,
+                generate_page_images=True,
+                accelerator_options=accelerator_options,
+                ocr_options=EasyOcrOptions(),
+            )
+            converter = DocumentConverter(
+                format_options={
+                    InputFormat.XLSX: ExcelFormatOption(pipeline_options=pipeline_options),
+                    InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
+                    InputFormat.PPTX: PowerpointFormatOption(pipeline_options=pipeline_options),
+                    InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options),
+                    InputFormat.MD: MarkdownFormatOption(pipeline_options=pipeline_options),
+                    InputFormat.IMAGE: PdfFormatOption(pipeline_options=pdf_pipeline_options),
+                    InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
+                },
+            )
+        # For txt files, temporarily rename to .md extension. Docling doesn't support text files natively.
+        if document.metadata.document_type == DocumentType.TXT:
+            original_suffix = document.local_path.suffix
+            document.local_path = document.local_path.rename(document.local_path.with_suffix(".md"))
+        partitioned_document = converter.convert(document.local_path).document
+        # Convert back to the original file.
+        if document.metadata.document_type == DocumentType.TXT:
+            document.local_path = document.local_path.rename(document.local_path.with_suffix(original_suffix))
+        return partitioned_document
+    def _chunk(self, partitioned_document: DoclingDocument, document: Document) -> list[Element]:
+        """
+        Chunk the partitioned document.
+        Args:
+            partitioned_document: The partitioned document by Docling.
+            document: The document to parse.
+        Returns:
+            The list of chunked elements.
+        """
+        # Use provided chunker or create default HierarchicalChunker
+        chunker = self.chunker or HierarchicalChunker()
+        text_elements: list[Element] = [
+            TextElement(
+                document_meta=document.metadata,
+                location=self._extract_element_location(chunk.meta.doc_items[0]),  # type: ignore
+                content=chunk.text,
+            )
+            for chunk in chunker.chunk(partitioned_document)
+        ]
+        if self.ignore_images:
+            return text_elements
+        return text_elements + [
+            ImageElement(
+                document_meta=document.metadata,
+                location=self._extract_element_location(element),
+                image_bytes=image_bytes,
+                ocr_extracted_text=element.caption_text(partitioned_document),
+            )
+            for element in partitioned_document.pictures
+            if (image := element.get_image(partitioned_document)) and (image_bytes := image._repr_jpeg_())
+        ]
+    @staticmethod
+    def _extract_element_location(element: DocItem) -> ElementLocation:
+        """
+        Convert docling element to element location.
+        Args:
+            element: The element from docling.
+        Returns:
+            The element location.
+        """
+        metadata = element.prov[0].model_dump() if element.prov else {}
+        return ElementLocation(
+            page_number=metadata.get("page_no"),
+        )

ragbits/document_search/ingestion/parsers/exceptions.py ADDED Viewed

@@ -0,0 +1,32 @@
+from ragbits.document_search.documents.document import DocumentType
+class ParserError(Exception):
+    """
+    Class for all exceptions raised by the document parser and router.
+    """
+    def __init__(self, message: str) -> None:
+        super().__init__(message)
+        self.message = message
+class ParserNotFoundError(ParserError):
+    """
+    Raised when no parser was found for the document type.
+    """
+    def __init__(self, document_type: DocumentType) -> None:
+        super().__init__(f"No parser found for the document type {document_type}")
+        self.document_type = document_type
+class ParserDocumentNotSupportedError(ParserError):
+    """
+    Raised when the document type is not supported by the parser.
+    """
+    def __init__(self, parser_name: str, document_type: DocumentType) -> None:
+        super().__init__(f"Document type {document_type.value} is not supported by the {parser_name}")
+        self.parser_name = parser_name
+        self.document_type = document_type

ragbits/document_search/ingestion/parsers/pptx/__init__.py ADDED Viewed

@@ -0,0 +1,28 @@
+from .callbacks import PptxCallback
+from .exceptions import (
+    PptxExtractionError,
+    PptxParserError,
+    PptxPresentationError,
+)
+from .hyperlink_callback import LinkCallback
+from .metadata_callback import MetaCallback
+from .parser import PptxDocumentParser
+from .speaker_notes_callback import NotesCallback
+DEFAULT_CALLBACKS = [
+    NotesCallback(),
+    LinkCallback(),
+    MetaCallback(),
+]
+__all__ = [
+    "DEFAULT_CALLBACKS",
+    "LinkCallback",
+    "MetaCallback",
+    "NotesCallback",
+    "PptxCallback",
+    "PptxDocumentParser",
+    "PptxExtractionError",
+    "PptxParserError",
+    "PptxPresentationError",
+]

ragbits/document_search/ingestion/parsers/pptx/callbacks.py ADDED Viewed

@@ -0,0 +1,32 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from pathlib import Path
+from docling_core.types.doc import DoclingDocument
+from pptx.presentation import Presentation
+class PptxCallback(ABC):
+    """
+    Abstract base class for PPTX document enhancement callbacks.
+    """
+    name: str
+    @abstractmethod
+    def __call__(
+        self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument
+    ) -> DoclingDocument:
+        """
+        Process PPTX presentation and enhance the docling document.
+        Args:
+            pptx_path: Path to the PPTX file.
+            presentation: Loaded PPTX presentation.
+            docling_document: Document to enhance.
+        Returns:
+            Enhanced docling document.
+        """
+        pass

ragbits/document_search/ingestion/parsers/pptx/exceptions.py ADDED Viewed

@@ -0,0 +1,52 @@
+from ragbits.document_search.ingestion.parsers.exceptions import ParserError
+class PptxParserError(ParserError):
+    """
+    Base class for all PPTX parser related exceptions.
+    Adds no behavior of its own; it only provides a common type to catch.
+    """
+class PptxExtractionError(PptxParserError):
+    """
+    Raised when an extractor fails to extract content from a shape or slide.
+    """
+    def __init__(self, extractor_name: str, slide_idx: int, shape_info: str, original_error: Exception) -> None:
+        """
+        Initialize the PptxExtractionError.
+        Args:
+            extractor_name: Name of the extractor that failed.
+            slide_idx: Index of the slide where extraction failed.
+            shape_info: Information about the shape that caused the failure.
+            original_error: The original exception that caused the failure.
+        """
+        message = (
+            f"Extractor '{extractor_name}' failed to extract content from slide {slide_idx}. "
+            f"Shape info: {shape_info}. Original error: {original_error}"
+        )
+        super().__init__(message)
+        self.extractor_name = extractor_name
+        self.slide_idx = slide_idx
+        self.shape_info = shape_info
+        self.original_error = original_error
+class PptxPresentationError(PptxParserError):
+    """
+    Raised when the PPTX presentation cannot be loaded or processed.
+    """
+    def __init__(self, file_path: str, original_error: Exception) -> None:
+        """
+        Initialize the PptxPresentationError.
+        Args:
+            file_path: Path to the PPTX file that failed to load.
+            original_error: The original exception that caused the failure.
+        """
+        message = f"Failed to load or process PPTX presentation from '{file_path}'. Original error: {original_error}"
+        super().__init__(message)
+        self.file_path = file_path
+        self.original_error = original_error

ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py ADDED Viewed

@@ -0,0 +1,84 @@
+from __future__ import annotations
+import logging
+from pathlib import Path
+from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem
+from pptx.presentation import Presentation
+from pptx.shapes.group import GroupShape
+from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback
+from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError
+logger = logging.getLogger(__name__)
+class LinkCallback(PptxCallback):
+    """
+    Callback to extract hyperlinks from PPTX shapes.
+    """
+    name = "link_callback"
+    def __call__(
+        self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument
+    ) -> DoclingDocument:
+        """
+        Extract hyperlinks from all shapes and add them to the docling document.
+        Args:
+            pptx_path: Path to the PPTX file.
+            presentation: Loaded PPTX presentation.
+            docling_document: Document to enhance with hyperlinks.
+        Returns:
+            Enhanced docling document with hyperlinks.
+        """
+        hyperlinks_added = 0
+        for slide_idx, slide in enumerate(presentation.slides, start=1):
+            for shape in slide.shapes:
+                try:
+                    hyperlink_address = self._extract_hyperlink_address(shape)
+                    if hyperlink_address:
+                        link_text = f"Link: {hyperlink_address}"
+                        hyperlink_item = TextItem(
+                            self_ref=f"#/links/{slide_idx + hyperlinks_added}",
+                            text=link_text,
+                            orig=link_text,
+                            label=DocItemLabel.TEXT,
+                            prov=[
+                                ProvenanceItem(
+                                    page_no=slide_idx,
+                                    bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0),
+                                    charspan=(0, len(link_text)),
+                                )
+                            ],
+                        )
+                        docling_document.texts.append(hyperlink_item)
+                        hyperlinks_added += 1
+                        logger.debug("Added hyperlink from slide %d: %s", slide_idx, hyperlink_address)
+                except (AttributeError, TypeError) as e:
+                    extraction_error = PptxExtractionError(self.name, slide_idx, "hyperlink from shape", e)
+                    logger.debug(
+                        "Failed to extract hyperlink from shape on slide %d: %s", slide_idx, str(extraction_error)
+                    )
+                    continue
+        if hyperlinks_added > 0:
+            logger.info("Successfully added %d hyperlinks to docling document", hyperlinks_added)
+        else:
+            logger.debug("No hyperlinks found in presentation")
+        return docling_document
+    @staticmethod
+    def _extract_hyperlink_address(shape: object) -> str | None:
+        if not hasattr(shape, "click_action") or isinstance(shape, GroupShape):
+            return None
+        if not shape.click_action.hyperlink or not shape.click_action.hyperlink.address:
+            return None
+        return shape.click_action.hyperlink.address

ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py ADDED Viewed

@@ -0,0 +1,78 @@
+from __future__ import annotations
+import logging
+from pathlib import Path
+from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem
+from pptx.presentation import Presentation
+from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback
+from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError
+logger = logging.getLogger(__name__)
+class MetaCallback(PptxCallback):
+    """
+    Callback to extract presentation metadata from PPTX files.
+    """
+    name = "meta_callback"
+    def __call__(
+        self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument
+    ) -> DoclingDocument:
+        """
+        Extract presentation metadata and add it to the docling document.
+        Args:
+            pptx_path: Path to the PPTX file.
+            presentation: Loaded PPTX presentation.
+            docling_document: Document to enhance with metadata.
+        Returns:
+            Enhanced docling document with metadata.
+        """
+        metadata_added = 0
+        try:
+            core_properties = presentation.core_properties
+            properties = [
+                ("author", core_properties.author),
+                ("title", core_properties.title),
+                ("subject", core_properties.subject),
+                ("keywords", core_properties.keywords),
+                ("category", core_properties.category),
+                ("created", str(core_properties.created) if core_properties.created else None),
+                ("modified", str(core_properties.modified) if core_properties.modified else None),
+            ]
+            for prop_name, prop_value in properties:
+                if prop_value is not None and str(prop_value).strip():
+                    meta_text = f"{prop_name}: {prop_value}"
+                    metadata_item = TextItem(
+                        self_ref=f"#/metadata/{metadata_added}",
+                        text=meta_text,
+                        orig=meta_text,
+                        label=DocItemLabel.TEXT,
+                        prov=[
+                            ProvenanceItem(
+                                page_no=0, bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0), charspan=(0, len(meta_text))
+                            )
+                        ],
+                    )
+                    docling_document.texts.append(metadata_item)
+                    metadata_added += 1
+                    logger.debug("Added metadata: %s = %s", prop_name, prop_value)
+        except (AttributeError, TypeError) as e:
+            extraction_error = PptxExtractionError(self.name, 0, "presentation metadata", e)
+            logger.debug("Failed to extract presentation metadata: %s", str(extraction_error))
+        if metadata_added > 0:
+            logger.info("Successfully added %d metadata properties to docling document", metadata_added)
+        else:
+            logger.debug("No metadata found in presentation")
+        return docling_document

ragbits/document_search/ingestion/parsers/pptx/parser.py ADDED Viewed

@@ -0,0 +1,85 @@
+from __future__ import annotations
+import logging
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import FormatOption
+from docling_core.transforms.chunker.base import BaseChunker
+from docling_core.types.doc import DoclingDocument
+from pptx import Presentation
+from ragbits.document_search.documents.document import Document, DocumentType
+from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser
+from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback
+from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError, PptxPresentationError
+logger = logging.getLogger(__name__)
+class PptxDocumentParser(DoclingDocumentParser):
+    """
+    Document parser for PPTX files with callback-based enhancement.
+    """
+    supported_document_types = {DocumentType.PPTX}
+    def __init__(
+        self,
+        ignore_images: bool = False,
+        num_threads: int = 1,
+        chunker: BaseChunker | None = None,
+        format_options: dict[InputFormat, FormatOption] | None = None,
+        pptx_callbacks: list[PptxCallback] | None = None,
+    ) -> None:
+        super().__init__(
+            ignore_images=ignore_images,
+            num_threads=num_threads,
+            chunker=chunker,
+            format_options=format_options,
+        )
+        if pptx_callbacks is None:
+            from ragbits.document_search.ingestion.parsers.pptx import DEFAULT_CALLBACKS
+            self.pptx_callbacks = DEFAULT_CALLBACKS
+        else:
+            self.pptx_callbacks = pptx_callbacks
+        logger.debug("Initialized PptxDocumentParser with %d callbacks", len(self.pptx_callbacks))
+    async def _partition(self, document: Document) -> DoclingDocument:
+        docling_document = await super()._partition(document)
+        if not self.pptx_callbacks:
+            return docling_document
+        logger.info("Enhancing docling document with %d callbacks", len(self.pptx_callbacks))
+        try:
+            presentation = Presentation(document.local_path.as_posix())
+        except Exception as e:
+            logger.error("Failed to load presentation for callbacks: %s", str(e))
+            raise PptxPresentationError(str(document.local_path), e) from e
+        successful_callbacks = 0
+        for callback in self.pptx_callbacks:
+            try:
+                logger.debug("Running callback: %s", callback.name)
+                docling_document = callback(document.local_path, presentation, docling_document)
+                successful_callbacks += 1
+                logger.debug("Successfully applied callback: %s", callback.name)
+            except Exception as e:
+                extraction_error = PptxExtractionError(callback.name, -1, "callback execution", e)
+                logger.error(
+                    "Callback %s failed: %s. Continuing with other callbacks.",
+                    callback.name,
+                    str(extraction_error),
+                    exc_info=True,
+                )
+        logger.info(
+            "Enhanced docling document with %d/%d successful callbacks",
+            successful_callbacks,
+            len(self.pptx_callbacks),
+        )
+        return docling_document

ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py ADDED Viewed

@@ -0,0 +1,75 @@
+from __future__ import annotations
+import logging
+from pathlib import Path
+from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem
+from pptx.presentation import Presentation
+from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback
+from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError
+logger = logging.getLogger(__name__)
+class NotesCallback(PptxCallback):
+    """
+    Callback to extract speaker notes from PPTX slides.
+    """
+    name = "notes_callback"
+    def __call__(
+        self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument
+    ) -> DoclingDocument:
+        """
+        Extract speaker notes from all slides and add them to the docling document.
+        Args:
+            pptx_path: Path to the PPTX file.
+            presentation: Loaded PPTX presentation.
+            docling_document: Document to enhance with speaker notes.
+        Returns:
+            Enhanced docling document with speaker notes.
+        """
+        notes_added = 0
+        for slide_idx, slide in enumerate(presentation.slides, start=1):
+            try:
+                if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None:
+                    notes_text_frame = slide.notes_slide.notes_text_frame
+                    text = getattr(notes_text_frame, "text", None)
+                    text = text.strip() if text else None
+                    if text:
+                        notes_item = TextItem(
+                            self_ref=f"#/notes/{slide_idx}",
+                            text=text,
+                            orig=text,
+                            label=DocItemLabel.TEXT,
+                            prov=[
+                                ProvenanceItem(
+                                    page_no=slide_idx,
+                                    bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0),
+                                    charspan=(0, len(text)),
+                                )
+                            ],
+                        )
+                        docling_document.texts.append(notes_item)
+                        notes_added += 1
+                        logger.debug("Added speaker notes from slide %d", slide_idx)
+            except (AttributeError, TypeError) as e:
+                extraction_error = PptxExtractionError(self.name, slide_idx, "speaker notes", e)
+                logger.debug("Failed to extract speaker notes from slide %d: %s", slide_idx, str(extraction_error))
+                continue
+        if notes_added > 0:
+            logger.info("Successfully added %d speaker notes to docling document", notes_added)
+        else:
+            logger.debug("No speaker notes found in presentation")
+        return docling_document