ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46):
  1. ragbits/document_search/__init__.py +3 -0
  2. ragbits/document_search/_main.py +273 -0
  3. ragbits/document_search/cli.py +109 -0
  4. ragbits/document_search/documents/__init__.py +0 -0
  5. ragbits/document_search/documents/document.py +203 -0
  6. ragbits/document_search/documents/element.py +208 -0
  7. ragbits/document_search/ingestion/__init__.py +0 -0
  8. ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
  9. ragbits/document_search/ingestion/enrichers/base.py +64 -0
  10. ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
  11. ragbits/document_search/ingestion/enrichers/image.py +107 -0
  12. ragbits/document_search/ingestion/enrichers/router.py +86 -0
  13. ragbits/document_search/ingestion/parsers/__init__.py +9 -0
  14. ragbits/document_search/ingestion/parsers/base.py +97 -0
  15. ragbits/document_search/ingestion/parsers/docling.py +178 -0
  16. ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
  17. ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
  18. ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
  19. ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
  20. ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
  21. ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
  22. ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
  23. ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
  24. ragbits/document_search/ingestion/parsers/router.py +90 -0
  25. ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
  26. ragbits/document_search/ingestion/strategies/__init__.py +6 -0
  27. ragbits/document_search/ingestion/strategies/base.py +290 -0
  28. ragbits/document_search/ingestion/strategies/batched.py +261 -0
  29. ragbits/document_search/ingestion/strategies/ray.py +138 -0
  30. ragbits/document_search/ingestion/strategies/sequential.py +23 -0
  31. ragbits/document_search/py.typed +0 -0
  32. ragbits/document_search/retrieval/__init__.py +0 -0
  33. ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
  34. ragbits/document_search/retrieval/rephrasers/base.py +39 -0
  35. ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
  36. ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
  37. ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
  38. ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
  39. ragbits/document_search/retrieval/rerankers/base.py +56 -0
  40. ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
  41. ragbits/document_search/retrieval/rerankers/llm.py +177 -0
  42. ragbits/document_search/retrieval/rerankers/noop.py +34 -0
  43. ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
  44. ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
  45. ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
  46. ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
@@ -0,0 +1,178 @@
1
+ from docling.chunking import HierarchicalChunker
2
+ from docling.datamodel.base_models import InputFormat
3
+ from docling.datamodel.pipeline_options import AcceleratorOptions, EasyOcrOptions, PdfPipelineOptions, PipelineOptions
4
+ from docling.document_converter import (
5
+ DocumentConverter,
6
+ ExcelFormatOption,
7
+ FormatOption,
8
+ HTMLFormatOption,
9
+ MarkdownFormatOption,
10
+ PdfFormatOption,
11
+ PowerpointFormatOption,
12
+ WordFormatOption,
13
+ )
14
+ from docling_core.transforms.chunker.base import BaseChunker
15
+ from docling_core.types.doc import DocItem, DoclingDocument
16
+
17
+ from ragbits.document_search.documents.document import Document, DocumentType
18
+ from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
19
+ from ragbits.document_search.ingestion.parsers import DocumentParser
20
+
21
+
22
class DoclingDocumentParser(DocumentParser):
    """
    Parser that uses Docling to process documents.
    """

    supported_document_types = {
        DocumentType.DOCX,
        DocumentType.PPTX,
        DocumentType.XLSX,
        DocumentType.MD,
        DocumentType.PNG,
        DocumentType.JPG,
        DocumentType.HTML,
        DocumentType.TXT,
        DocumentType.PDF,
    }

    def __init__(
        self,
        ignore_images: bool = False,
        num_threads: int = 1,
        chunker: BaseChunker | None = None,
        format_options: dict[InputFormat, FormatOption] | None = None,
    ) -> None:
        """
        Initialize the DoclingDocumentParser instance.

        Args:
            ignore_images: If True images will be skipped.
            num_threads: The number of threads for parsing parallelism on CPU.
            chunker: Custom chunker instance. If None, HierarchicalChunker will be used.
            format_options: Full format options configuration for DocumentConverter.
                If None, default format options will be used.
        """
        self.ignore_images = ignore_images
        self.num_threads = num_threads
        self.chunker = chunker
        self.format_options = format_options

    async def parse(self, document: Document) -> list[Element]:
        """
        Parse the document using the Docling API.

        Args:
            document: The document to parse.

        Returns:
            The list of elements extracted from the document.
        """
        self.validate_document_type(document.metadata.document_type)
        partitioned_document = await self._partition(document)
        return self._chunk(partitioned_document, document)

    async def _partition(self, document: Document) -> DoclingDocument:
        """
        Partition the document.

        Args:
            document: The document to parse.

        Returns:
            The docling document.

        Raises:
            ConversionError: If converting the document to the Docling format fails.
        """
        # Use provided format_options or create default ones
        if self.format_options is not None:
            converter = DocumentConverter(format_options=self.format_options)
        else:
            # Build default format options
            accelerator_options = AcceleratorOptions(num_threads=self.num_threads)
            pipeline_options = PipelineOptions(accelerator_options=accelerator_options)
            pdf_pipeline_options = PdfPipelineOptions(
                images_scale=2,
                generate_page_images=True,
                accelerator_options=accelerator_options,
                ocr_options=EasyOcrOptions(),
            )

            converter = DocumentConverter(
                format_options={
                    InputFormat.XLSX: ExcelFormatOption(pipeline_options=pipeline_options),
                    InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
                    InputFormat.PPTX: PowerpointFormatOption(pipeline_options=pipeline_options),
                    InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options),
                    InputFormat.MD: MarkdownFormatOption(pipeline_options=pipeline_options),
                    InputFormat.IMAGE: PdfFormatOption(pipeline_options=pdf_pipeline_options),
                    InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
                },
            )

        if document.metadata.document_type != DocumentType.TXT:
            return converter.convert(document.local_path).document

        # For txt files, temporarily rename to .md extension. Docling doesn't support text files natively.
        original_suffix = document.local_path.suffix
        document.local_path = document.local_path.rename(document.local_path.with_suffix(".md"))
        try:
            return converter.convert(document.local_path).document
        finally:
            # Always restore the original file name, even when the conversion fails,
            # so the file on disk and document.local_path are never left pointing at the .md copy.
            document.local_path = document.local_path.rename(document.local_path.with_suffix(original_suffix))

    def _chunk(self, partitioned_document: DoclingDocument, document: Document) -> list[Element]:
        """
        Chunk the partitioned document.

        Args:
            partitioned_document: The partitioned document by Docling.
            document: The document to parse.

        Returns:
            The list of chunked elements: one text element per chunk, followed by
            one image element per extractable picture unless images are ignored.
        """
        # Use provided chunker or create default HierarchicalChunker
        chunker = self.chunker or HierarchicalChunker()

        text_elements: list[Element] = [
            TextElement(
                document_meta=document.metadata,
                # The location of a chunk is taken from the first docling item it was built from.
                location=self._extract_element_location(chunk.meta.doc_items[0]),  # type: ignore
                content=chunk.text,
            )
            for chunk in chunker.chunk(partitioned_document)
        ]

        if self.ignore_images:
            return text_elements

        # Pictures without an image, or whose JPEG representation is empty, are skipped.
        return text_elements + [
            ImageElement(
                document_meta=document.metadata,
                location=self._extract_element_location(element),
                image_bytes=image_bytes,
                ocr_extracted_text=element.caption_text(partitioned_document),
            )
            for element in partitioned_document.pictures
            if (image := element.get_image(partitioned_document)) and (image_bytes := image._repr_jpeg_())
        ]

    @staticmethod
    def _extract_element_location(element: DocItem) -> ElementLocation:
        """
        Convert docling element to element location.

        Args:
            element: The element from docling.

        Returns:
            The element location; the page number is None when the element has no provenance.
        """
        metadata = element.prov[0].model_dump() if element.prov else {}
        return ElementLocation(
            page_number=metadata.get("page_no"),
        )
@@ -0,0 +1,32 @@
1
+ from ragbits.document_search.documents.document import DocumentType
2
+
3
+
4
class ParserError(Exception):
    """
    Base exception for every error raised by the document parsers and the parser router.
    """

    def __init__(self, message: str) -> None:
        # Keep the message on the instance as well as in the standard exception args.
        self.message = message
        super().__init__(message)
12
+
13
+
14
class ParserNotFoundError(ParserError):
    """
    Signals that no parser is registered for the requested document type.
    """

    def __init__(self, document_type: DocumentType) -> None:
        # The offending type is kept on the instance for programmatic access.
        self.document_type = document_type
        super().__init__(f"No parser found for the document type {document_type}")
22
+
23
+
24
class ParserDocumentNotSupportedError(ParserError):
    """
    Signals that a parser was asked to handle a document type it does not support.
    """

    def __init__(self, parser_name: str, document_type: DocumentType) -> None:
        # Both the parser name and the rejected type are kept for programmatic access.
        self.parser_name = parser_name
        self.document_type = document_type
        super().__init__(f"Document type {document_type.value} is not supported by the {parser_name}")
@@ -0,0 +1,28 @@
1
+ from .callbacks import PptxCallback
2
+ from .exceptions import (
3
+ PptxExtractionError,
4
+ PptxParserError,
5
+ PptxPresentationError,
6
+ )
7
+ from .hyperlink_callback import LinkCallback
8
+ from .metadata_callback import MetaCallback
9
+ from .parser import PptxDocumentParser
10
+ from .speaker_notes_callback import NotesCallback
11
+
12
# Callbacks applied by PptxDocumentParser when none are passed explicitly;
# they run in the order listed here.
DEFAULT_CALLBACKS = [NotesCallback(), LinkCallback(), MetaCallback()]

# Public API of the pptx parser package.
__all__ = [
    "DEFAULT_CALLBACKS",
    "LinkCallback",
    "MetaCallback",
    "NotesCallback",
    "PptxCallback",
    "PptxDocumentParser",
    "PptxExtractionError",
    "PptxParserError",
    "PptxPresentationError",
]
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+
6
+ from docling_core.types.doc import DoclingDocument
7
+ from pptx.presentation import Presentation
8
+
9
+
10
class PptxCallback(ABC):
    """
    Interface for callbacks that enrich a docling document with data read from a PPTX file.
    """

    # Identifier of the callback; concrete subclasses assign it.
    name: str

    @abstractmethod
    def __call__(
        self,
        pptx_path: Path,
        presentation: Presentation,
        docling_document: DoclingDocument,
    ) -> DoclingDocument:
        """
        Enhance the docling document using the given PPTX presentation.

        Args:
            pptx_path: Path to the PPTX file.
            presentation: Loaded PPTX presentation.
            docling_document: Document to enhance.

        Returns:
            Enhanced docling document.
        """
@@ -0,0 +1,52 @@
1
+ from ragbits.document_search.ingestion.parsers.exceptions import ParserError
2
+
3
+
4
class PptxParserError(ParserError):
    """
    Common ancestor of all exceptions raised by the PPTX parser.
    """
8
+
9
+
10
class PptxExtractionError(PptxParserError):
    """
    Signals that an extractor could not extract content from a shape or slide.
    """

    def __init__(self, extractor_name: str, slide_idx: int, shape_info: str, original_error: Exception) -> None:
        """
        Build the error message and record the failure details on the instance.

        Args:
            extractor_name: Name of the extractor that failed.
            slide_idx: Index of the slide where extraction failed.
            shape_info: Information about the shape that caused the failure.
            original_error: The original exception that caused the failure.
        """
        self.extractor_name = extractor_name
        self.slide_idx = slide_idx
        self.shape_info = shape_info
        self.original_error = original_error

        where = f"Extractor '{extractor_name}' failed to extract content from slide {slide_idx}. "
        details = f"Shape info: {shape_info}. Original error: {original_error}"
        super().__init__(where + details)
34
+
35
+
36
class PptxPresentationError(PptxParserError):
    """
    Signals that a PPTX presentation could not be loaded or processed.
    """

    def __init__(self, file_path: str, original_error: Exception) -> None:
        """
        Build the error message and record the failure details on the instance.

        Args:
            file_path: Path to the PPTX file that failed to load.
            original_error: The original exception that caused the failure.
        """
        self.file_path = file_path
        self.original_error = original_error
        super().__init__(
            f"Failed to load or process PPTX presentation from '{file_path}'. Original error: {original_error}"
        )
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem
7
+ from pptx.presentation import Presentation
8
+ from pptx.shapes.group import GroupShape
9
+
10
+ from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback
11
+ from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class LinkCallback(PptxCallback):
    """
    Callback to extract hyperlinks from PPTX shapes.
    """

    name = "link_callback"

    def __call__(
        self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument
    ) -> DoclingDocument:
        """
        Extract click-action hyperlinks from the top-level shapes of every slide
        and append them to the docling document as text items.

        Args:
            pptx_path: Path to the PPTX file.
            presentation: Loaded PPTX presentation.
            docling_document: Document to enhance with hyperlinks.

        Returns:
            Enhanced docling document with hyperlinks.
        """
        hyperlinks_added = 0

        for slide_idx, slide in enumerate(presentation.slides, start=1):
            for shape in slide.shapes:
                try:
                    hyperlink_address = self._extract_hyperlink_address(shape)
                    if hyperlink_address:
                        link_text = f"Link: {hyperlink_address}"
                        hyperlink_item = TextItem(
                            self_ref=f"#/links/{slide_idx + hyperlinks_added}",
                            text=link_text,
                            orig=link_text,
                            label=DocItemLabel.TEXT,
                            prov=[
                                ProvenanceItem(
                                    page_no=slide_idx,
                                    # Placeholder box: the link is attributed to the slide, not a position on it.
                                    bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0),
                                    charspan=(0, len(link_text)),
                                )
                            ],
                        )

                        docling_document.texts.append(hyperlink_item)
                        hyperlinks_added += 1

                        logger.debug("Added hyperlink from slide %d: %s", slide_idx, hyperlink_address)

                except (AttributeError, TypeError) as e:
                    # Best effort: a shape that cannot be inspected is logged and skipped.
                    extraction_error = PptxExtractionError(self.name, slide_idx, "hyperlink from shape", e)
                    logger.debug(
                        "Failed to extract hyperlink from shape on slide %d: %s", slide_idx, str(extraction_error)
                    )

        if hyperlinks_added > 0:
            logger.info("Successfully added %d hyperlinks to docling document", hyperlinks_added)
        else:
            logger.debug("No hyperlinks found in presentation")

        return docling_document

    @staticmethod
    def _extract_hyperlink_address(shape: object) -> str | None:
        """
        Return the click-action hyperlink address of a shape, if it has one.

        Args:
            shape: A shape taken from a slide.

        Returns:
            The hyperlink address, or None when the shape is a group shape, has no
            click action, or its click action carries no hyperlink address.
        """
        # The group-shape check must come first: GroupShape.click_action raises TypeError,
        # which hasattr() does not swallow (it only handles AttributeError).
        if isinstance(shape, GroupShape) or not hasattr(shape, "click_action"):
            return None
        hyperlink = shape.click_action.hyperlink
        if not hyperlink or not hyperlink.address:
            return None
        return hyperlink.address
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem
7
+ from pptx.presentation import Presentation
8
+
9
+ from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback
10
+ from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class MetaCallback(PptxCallback):
    """
    Callback that copies the presentation's core properties into the docling document.
    """

    name = "meta_callback"

    def __call__(
        self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument
    ) -> DoclingDocument:
        """
        Append one text item per non-empty core property of the presentation.

        Args:
            pptx_path: Path to the PPTX file.
            presentation: Loaded PPTX presentation.
            docling_document: Document to enhance with metadata.

        Returns:
            Enhanced docling document with metadata.
        """
        added = 0

        try:
            core = presentation.core_properties
            entries = [
                ("author", core.author),
                ("title", core.title),
                ("subject", core.subject),
                ("keywords", core.keywords),
                ("category", core.category),
                ("created", str(core.created) if core.created else None),
                ("modified", str(core.modified) if core.modified else None),
            ]

            for key, value in entries:
                # Skip properties that are unset or blank.
                if value is None or not str(value).strip():
                    continue

                line = f"{key}: {value}"
                # Metadata belongs to the whole deck, so it is attached to page 0 with a placeholder box.
                provenance = ProvenanceItem(
                    page_no=0, bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0), charspan=(0, len(line))
                )
                docling_document.texts.append(
                    TextItem(
                        self_ref=f"#/metadata/{added}",
                        text=line,
                        orig=line,
                        label=DocItemLabel.TEXT,
                        prov=[provenance],
                    )
                )
                added += 1

                logger.debug("Added metadata: %s = %s", key, value)
        except (AttributeError, TypeError) as e:
            # Best effort: a failure while reading properties is logged, not raised.
            extraction_error = PptxExtractionError(self.name, 0, "presentation metadata", e)
            logger.debug("Failed to extract presentation metadata: %s", str(extraction_error))

        if added > 0:
            logger.info("Successfully added %d metadata properties to docling document", added)
        else:
            logger.debug("No metadata found in presentation")

        return docling_document
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ from docling.datamodel.base_models import InputFormat
6
+ from docling.document_converter import FormatOption
7
+ from docling_core.transforms.chunker.base import BaseChunker
8
+ from docling_core.types.doc import DoclingDocument
9
+ from pptx import Presentation
10
+
11
+ from ragbits.document_search.documents.document import Document, DocumentType
12
+ from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser
13
+ from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback
14
+ from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError, PptxPresentationError
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class PptxDocumentParser(DoclingDocumentParser):
    """
    Document parser for PPTX files with callback-based enhancement.
    """

    supported_document_types = {DocumentType.PPTX}

    def __init__(
        self,
        ignore_images: bool = False,
        num_threads: int = 1,
        chunker: BaseChunker | None = None,
        format_options: dict[InputFormat, FormatOption] | None = None,
        pptx_callbacks: list[PptxCallback] | None = None,
    ) -> None:
        """
        Initialize the PptxDocumentParser instance.

        Args:
            ignore_images: If True images will be skipped.
            num_threads: The number of threads for parsing parallelism on CPU.
            chunker: Custom chunker instance, forwarded to the base parser.
            format_options: Format options configuration, forwarded to the base parser.
            pptx_callbacks: Callbacks applied to the docling document after partitioning.
                If None, the package defaults are used; an empty list disables enhancement.
        """
        super().__init__(
            ignore_images=ignore_images,
            num_threads=num_threads,
            chunker=chunker,
            format_options=format_options,
        )

        if pptx_callbacks is None:
            # Imported here rather than at module level because the package __init__ imports this module.
            from ragbits.document_search.ingestion.parsers.pptx import DEFAULT_CALLBACKS

            # Copy the list so that changing one parser's callbacks never alters the shared defaults.
            self.pptx_callbacks = list(DEFAULT_CALLBACKS)
        else:
            self.pptx_callbacks = pptx_callbacks

        logger.debug("Initialized PptxDocumentParser with %d callbacks", len(self.pptx_callbacks))

    async def _partition(self, document: Document) -> DoclingDocument:
        """
        Partition the document with the base parser, then run the PPTX callbacks on the result.

        Args:
            document: The document to parse.

        Returns:
            The docling document, enhanced by every callback that succeeded.

        Raises:
            PptxPresentationError: If the presentation cannot be loaded for the callbacks.
        """
        docling_document = await super()._partition(document)

        if not self.pptx_callbacks:
            return docling_document

        logger.info("Enhancing docling document with %d callbacks", len(self.pptx_callbacks))

        try:
            presentation = Presentation(document.local_path.as_posix())
        except Exception as e:
            logger.error("Failed to load presentation for callbacks: %s", str(e))
            raise PptxPresentationError(str(document.local_path), e) from e

        successful_callbacks = 0
        for callback in self.pptx_callbacks:
            try:
                logger.debug("Running callback: %s", callback.name)
                docling_document = callback(document.local_path, presentation, docling_document)
                successful_callbacks += 1
                logger.debug("Successfully applied callback: %s", callback.name)
            except Exception as e:
                # Best effort: a failing callback is logged and the remaining callbacks still run.
                extraction_error = PptxExtractionError(callback.name, -1, "callback execution", e)
                logger.error(
                    "Callback %s failed: %s. Continuing with other callbacks.",
                    callback.name,
                    str(extraction_error),
                    exc_info=True,
                )

        logger.info(
            "Enhanced docling document with %d/%d successful callbacks",
            successful_callbacks,
            len(self.pptx_callbacks),
        )
        return docling_document
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem
7
+ from pptx.presentation import Presentation
8
+
9
+ from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback
10
+ from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class NotesCallback(PptxCallback):
    """
    Callback that copies speaker notes from PPTX slides into the docling document.
    """

    name = "notes_callback"

    def __call__(
        self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument
    ) -> DoclingDocument:
        """
        Append one text item per slide that has non-empty speaker notes.

        Args:
            pptx_path: Path to the PPTX file.
            presentation: Loaded PPTX presentation.
            docling_document: Document to enhance with speaker notes.

        Returns:
            Enhanced docling document with speaker notes.
        """
        added = 0

        for slide_no, slide in enumerate(presentation.slides, start=1):
            try:
                if not slide.has_notes_slide:
                    continue
                frame = slide.notes_slide.notes_text_frame
                if frame is None:
                    continue

                raw_text = getattr(frame, "text", None)
                notes = raw_text.strip() if raw_text else None
                if not notes:
                    continue

                # Notes are attributed to their slide with a placeholder bounding box.
                provenance = ProvenanceItem(
                    page_no=slide_no,
                    bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0),
                    charspan=(0, len(notes)),
                )
                docling_document.texts.append(
                    TextItem(
                        self_ref=f"#/notes/{slide_no}",
                        text=notes,
                        orig=notes,
                        label=DocItemLabel.TEXT,
                        prov=[provenance],
                    )
                )
                added += 1

                logger.debug("Added speaker notes from slide %d", slide_no)

            except (AttributeError, TypeError) as e:
                # Best effort: a slide whose notes cannot be read is logged and skipped.
                extraction_error = PptxExtractionError(self.name, slide_no, "speaker notes", e)
                logger.debug("Failed to extract speaker notes from slide %d: %s", slide_no, str(extraction_error))

        if added > 0:
            logger.info("Successfully added %d speaker notes to docling document", added)
        else:
            logger.debug("No speaker notes found in presentation")

        return docling_document