docling-2.69.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,230 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from collections.abc import Iterable
4
+ from typing import Any, Generic, Optional, Protocol, Type, Union
5
+
6
+ import numpy as np
7
+ from docling_core.types.doc import (
8
+ BoundingBox,
9
+ DocItem,
10
+ DoclingDocument,
11
+ NodeItem,
12
+ PictureItem,
13
+ )
14
+ from PIL.Image import Image
15
+ from typing_extensions import TypeVar
16
+
17
+ from docling.datamodel.base_models import (
18
+ ItemAndImageEnrichmentElement,
19
+ Page,
20
+ VlmPrediction,
21
+ )
22
+ from docling.datamodel.document import ConversionResult
23
+ from docling.datamodel.pipeline_options import BaseOptions
24
+ from docling.datamodel.pipeline_options_vlm_model import (
25
+ InlineVlmOptions,
26
+ TransformersPromptStyle,
27
+ )
28
+ from docling.datamodel.settings import settings
29
+
30
+
31
class BaseModelWithOptions(Protocol):
    """Interface for models that are configured through an options object."""

    # Returns the options class (a ``BaseOptions`` subclass) used by the model.
    @classmethod
    def get_options_type(cls) -> Type[BaseOptions]: ...

    # ``options`` must be passed by keyword; further keyword arguments are
    # accepted so implementations can take additional constructor parameters.
    def __init__(self, *, options: BaseOptions, **kwargs): ...
36
+
37
+
38
class BasePageModel(ABC):
    """Abstract interface for models that are applied to a batch of pages."""

    @abstractmethod
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Process ``page_batch`` for the conversion ``conv_res``.

        Args:
            conv_res: The conversion result the pages belong to.
            page_batch: The pages to process.

        Returns:
            An iterable of pages.
        """
        pass
44
+
45
+
46
class BaseVlmModel(ABC):
    """Base class for Vision-Language Models that adds image processing capability."""

    @abstractmethod
    def process_images(
        self,
        image_batch: Iterable[Union[Image, np.ndarray]],
        prompt: Union[str, list[str]],
    ) -> Iterable[VlmPrediction]:
        """Process raw images without page metadata.

        Args:
            image_batch: Iterable of PIL Images or numpy arrays
            prompt: Either:
                - str: Single prompt used for all images
                - list[str]: List of prompts (one per image, must match image count)

        Returns:
            An iterable of ``VlmPrediction`` objects.

        Raises:
            ValueError: If prompt list length doesn't match image count.
        """
66
+
67
+
68
class BaseVlmPageModel(BasePageModel, BaseVlmModel):
    """Common base for VLM models that are also page models.

    Combines the page-level call interface with raw image processing and
    offers prompt helpers for concrete implementations. ``__call__`` is
    abstract here, so every subclass supplies its own page handling.
    """

    # Attributes that subclasses must initialize.
    vlm_options: InlineVlmOptions
    processor: Any

    def _build_prompt_safe(self, page: Page) -> str:
        """Build the prompt for ``page``, tolerating older ``build_prompt`` overrides.

        The prompt builder is first called with the ``_internal_page`` keyword.
        If that call raises ``TypeError`` (e.g. a user override that does not
        accept the keyword), it is called again with the parsed page only.

        Args:
            page: The page whose ``parsed_page`` is handed to the prompt builder.

        Returns:
            The value returned by ``vlm_options.build_prompt``.
        """
        options = self.vlm_options
        try:
            prompt = options.build_prompt(page.parsed_page, _internal_page=page)
        except TypeError:
            # The override does not take _internal_page: use the basic call.
            prompt = options.build_prompt(page.parsed_page)
        return prompt

    @abstractmethod
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Extract images from pages, process them, and attach results back."""

    def formulate_prompt(self, user_prompt: str) -> str:
        """Wrap ``user_prompt`` according to the configured prompt style.

        Args:
            user_prompt: The prompt text supplied by the caller.

        Returns:
            ``user_prompt`` unchanged for the RAW style, an empty string for
            the NONE style, a Phi-4 specific template when the configured
            repo id is ``microsoft/Phi-4-multimodal-instruct``, or the output
            of the processor's chat template for the CHAT style.

        Raises:
            RuntimeError: If none of the cases above applies.
        """
        log = logging.getLogger(__name__)
        options = self.vlm_options
        style = options.transformers_prompt_style

        if style == TransformersPromptStyle.RAW:
            return user_prompt

        if style == TransformersPromptStyle.NONE:
            return ""

        if options.repo_id == "microsoft/Phi-4-multimodal-instruct":
            log.debug("Using specialized prompt for Phi-4")
            # Note: This might need adjustment for VLLM vs transformers
            prompt = f"<|user|><|image_1|>{user_prompt}<|end|><|assistant|>"
            log.debug(f"prompt for {options.repo_id}: {prompt}")
            return prompt

        if style == TransformersPromptStyle.CHAT:
            text_intro = {"type": "text", "text": "This is a page from a document."}
            image_slot = {"type": "image"}
            text_request = {"type": "text", "text": user_prompt}
            messages = [
                {"role": "user", "content": [text_intro, image_slot, text_request]}
            ]
            return self.processor.apply_chat_template(
                messages, add_generation_prompt=True
            )

        valid_values = ", ".join(s.value for s in TransformersPromptStyle)
        raise RuntimeError(
            f"Unknown prompt style `{style}`. Valid values are {valid_values}."
        )
145
+
146
+
147
# Type of the prepared element that an enrichment model receives in its
# ``__call__``; falls back to ``NodeItem`` when not parametrized (``default=``
# is provided by the ``typing_extensions`` TypeVar).
EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
148
+
149
+
150
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
    """Abstract enrichment model, generic over the prepared element type."""

    # Batch size for elements; the default is read from the global settings.
    elements_batch_size: int = settings.perf.elements_batch_size

    @abstractmethod
    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Return whether this model should handle ``element`` of ``doc``."""
        pass

    @abstractmethod
    def prepare_element(
        self, conv_res: ConversionResult, element: NodeItem
    ) -> Optional[EnrichElementT]:
        """Convert ``element`` into the model's input type.

        Returns:
            The prepared element, or ``None`` if there is nothing to process.
        """
        pass

    @abstractmethod
    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
    ) -> Iterable[NodeItem]:
        """Process a batch of prepared elements and return node items."""
        pass
168
+
169
+
170
class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
    """Enrichment model whose prepared element is the node item itself."""

    def prepare_element(
        self, conv_res: ConversionResult, element: NodeItem
    ) -> Optional[NodeItem]:
        """Return ``element`` when it is processable, otherwise ``None``."""
        processable = self.is_processable(doc=conv_res.document, element=element)
        return element if processable else None
177
+
178
+
179
class BaseItemAndImageEnrichmentModel(
    GenericEnrichmentModel[ItemAndImageEnrichmentElement]
):
    """Enrichment model that pairs each document item with an image of it."""

    # Scale passed to ``Page.get_image`` when cropping the item from its page.
    images_scale: float
    # Fraction of the box width/height added on each side before cropping.
    expansion_factor: float = 0.0

    def prepare_element(
        self, conv_res: ConversionResult, element: NodeItem
    ) -> Optional[ItemAndImageEnrichmentElement]:
        """Pair ``element`` with an image, or return ``None`` to skip it.

        For pictures an embedded image is preferred. Otherwise the image is
        cropped from the page referenced by the element's first provenance
        entry, using the bounding box grown by ``expansion_factor``.

        Args:
            conv_res: Conversion result providing the document and its pages.
            element: The candidate item.

        Returns:
            The item together with its image, or ``None`` when the element is
            not processable, has no provenance to crop from, or (for pictures)
            neither a page crop nor an embedded image is available.
        """
        if not self.is_processable(doc=conv_res.document, element=element):
            return None

        assert isinstance(element, DocItem)

        # Allow the case of documents without page images but embedded images
        # (e.g. Word and HTML docs)
        if isinstance(element, PictureItem):
            embedded_im = element.get_image(conv_res.document)
            if embedded_im is not None:
                return ItemAndImageEnrichmentElement(item=element, image=embedded_im)

        # Without provenance there is no page region to crop. This guard used
        # to apply to pictures only, so other items with an empty ``prov``
        # raised IndexError on ``prov[0]`` below.
        if len(element.prov) == 0:
            return None

        # Crop the image from the page
        element_prov = element.prov[0]
        bbox = element_prov.bbox
        width = bbox.r - bbox.l
        # The sign of ``height`` follows the coordinate origin, so the
        # arithmetic below grows the box for either orientation.
        height = bbox.t - bbox.b

        # TODO: move to a utility in the BoundingBox class
        expanded_bbox = BoundingBox(
            l=bbox.l - width * self.expansion_factor,
            t=bbox.t + height * self.expansion_factor,
            r=bbox.r + width * self.expansion_factor,
            b=bbox.b - height * self.expansion_factor,
            coord_origin=bbox.coord_origin,
        )

        # Page index is taken relative to the first page held by conv_res.
        page_ix = element_prov.page_no - conv_res.pages[0].page_no
        cropped_image = conv_res.pages[page_ix].get_image(
            scale=self.images_scale, cropbox=expanded_bbox
        )

        # Allow for images being embedded without the page backend or page images
        if cropped_image is None and isinstance(element, PictureItem):
            embedded_im = element.get_image(conv_res.document)
            if embedded_im is not None:
                return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
            else:
                return None

        # Return the proper cropped image
        return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
@@ -0,0 +1,241 @@
1
+ import copy
2
+ import logging
3
+ from abc import abstractmethod
4
+ from collections.abc import Iterable
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, List, Optional, Type
7
+
8
+ import numpy as np
9
+ from docling_core.types.doc import BoundingBox, CoordOrigin
10
+ from docling_core.types.doc.page import TextCell
11
+ from PIL import Image, ImageDraw
12
+ from rtree import index
13
+
14
+ from docling.datamodel.accelerator_options import AcceleratorOptions
15
+ from docling.datamodel.base_models import Page
16
+ from docling.datamodel.document import ConversionResult
17
+ from docling.datamodel.pipeline_options import OcrOptions
18
+ from docling.datamodel.settings import settings
19
+ from docling.models.base_model import BaseModelWithOptions, BasePageModel
20
+
21
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
22
+
23
+
24
class BaseOcrModel(BasePageModel, BaseModelWithOptions):
    """Shared helpers for OCR page models.

    Provides selection of page regions to OCR, merging of OCR cells with
    existing page cells, and a debug visualization. Subclasses implement
    ``__call__`` and ``get_options_type``.
    """

    def __init__(
        self,
        *,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: OcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Store the enabled flag and options.

        ``artifacts_path`` and ``accelerator_options`` are accepted but not
        used by this base initializer.
        """
        # Make sure any delay/error from import occurs on ocr model init and not first use
        from scipy.ndimage import binary_dilation, find_objects, label

        self.enabled = enabled
        self.options = options

    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
        """Return the rectangles of ``page`` that should be sent to OCR.

        Returns:
            A single full-page box when full-page OCR is forced or bitmap
            coverage exceeds both the built-in and the configured threshold;
            the merged bitmap regions when coverage exceeds only the configured
            threshold; otherwise an empty list.
        """
        from scipy.ndimage import binary_dilation, find_objects, label

        BITMAP_COVERAGE_THRESHOLD = 0.75
        assert page.size is not None

        def find_ocr_rects(size, bitmap_rects):
            image = Image.new(
                "1", (round(size.width), round(size.height))
            )  # '1' mode is binary

            # Draw all bitmap rects into a binary image
            draw = ImageDraw.Draw(image)
            for rect in bitmap_rects:
                x0, y0, x1, y1 = rect.as_tuple()
                x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
                draw.rectangle([(x0, y0), (x1, y1)], fill=1)

            np_image = np.array(image)

            # Dilate with a 20x20 structuring element so that nearby bitmap
            # rectangles merge into one region
            structure = np.ones((20, 20))
            np_image = binary_dilation(np_image > 0, structure=structure)

            # Find the connected components of the non-zero (bitmap) pixels
            labeled_image, num_features = label(np_image > 0)

            # Find enclosing bounding boxes for each connected component.
            slices = find_objects(labeled_image)
            bounding_boxes = [
                BoundingBox(
                    l=slc[1].start,
                    t=slc[0].start,
                    r=slc[1].stop - 1,
                    b=slc[0].stop - 1,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
                for slc in slices
            ]

            # Compute area fraction on page covered by bitmaps (after dilation)
            area_frac = np.sum(np_image > 0) / (size.width * size.height)

            return (area_frac, bounding_boxes)  # fraction covered  # boxes

        if page._backend is not None:
            bitmap_rects = page._backend.get_bitmap_rects()
        else:
            bitmap_rects = []
        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

        # return full-page rectangle if page is dominantly covered with bitmaps
        if self.options.force_full_page_ocr or coverage > max(
            BITMAP_COVERAGE_THRESHOLD, self.options.bitmap_area_threshold
        ):
            return [
                BoundingBox(
                    l=0,
                    t=0,
                    r=page.size.width,
                    b=page.size.height,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
            ]
        # return individual rectangles if the bitmap coverage is above the threshold
        elif coverage > self.options.bitmap_area_threshold:
            return ocr_rects
        else:  # overall coverage of bitmaps is too low, drop all bitmap rectangles.
            return []

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
    def _filter_ocr_cells(
        self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
    ) -> List[TextCell]:
        """Return the OCR cells whose boxes intersect no programmatic cell."""
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
        idx = index.Index(properties=p)
        for i, cell in enumerate(programmatic_cells):
            idx.insert(i, cell.rect.to_bounding_box().as_tuple())

        def is_overlapping_with_existing_cells(ocr_cell):
            # Query the R-tree to get overlapping rectangles
            possible_matches_index = list(
                idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
            )

            return (
                len(possible_matches_index) > 0
            )  # this is a weak criterion but it works.

        filtered_ocr_cells = [
            cell for cell in ocr_cells if not is_overlapping_with_existing_cells(cell)
        ]
        return filtered_ocr_cells

    def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
        """
        Post-process the OCR cells and update the page object.
        Updates parsed_page.textline_cells directly since page.cells is now read-only.
        """
        # Get existing cells from the read-only property
        existing_cells = page.cells

        # Combine existing and OCR cells with overlap filtering
        final_cells = self._combine_cells(existing_cells, ocr_cells)

        assert page.parsed_page is not None

        # Update parsed_page.textline_cells directly
        page.parsed_page.textline_cells = final_cells
        page.parsed_page.has_lines = len(final_cells) > 0

        # When force_full_page_ocr is used, PDF-extracted word/char cells are
        # unreliable. Filter out cells where from_ocr=False, keeping any OCR-
        # generated cells. This ensures downstream components (e.g., table
        # structure model) fall back to OCR-extracted textline cells.
        if self.options.force_full_page_ocr:
            page.parsed_page.word_cells = [
                c for c in page.parsed_page.word_cells if c.from_ocr
            ]
            page.parsed_page.char_cells = [
                c for c in page.parsed_page.char_cells if c.from_ocr
            ]
            page.parsed_page.has_words = len(page.parsed_page.word_cells) > 0
            page.parsed_page.has_chars = len(page.parsed_page.char_cells) > 0

    def _combine_cells(
        self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
    ) -> List[TextCell]:
        """Combine existing and OCR cells with filtering and re-indexing.

        With forced full-page OCR only the OCR cells are kept; otherwise the
        existing cells come first, followed by the non-overlapping OCR cells.
        The ``index`` of every returned cell is rewritten in place.
        """
        if self.options.force_full_page_ocr:
            combined = ocr_cells
        else:
            filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
            combined = list(existing_cells) + filtered_ocr_cells

        # Re-index in-place
        for i, cell in enumerate(combined):
            cell.index = i

        return combined

    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        """Render OCR rectangles and page cells onto a copy of the page image.

        Args:
            conv_res: Conversion result; its input file stem names the output
                directory when the image is saved.
            page: Page providing the image, size and cells.
            ocr_rects: Rectangles to shade in transparent yellow.
            show: If true, display the image; otherwise save it as a PNG under
                the configured debug output path.
        """
        image = copy.deepcopy(page.image)
        scale_x = image.width / page.size.width
        scale_y = image.height / page.size.height

        draw = ImageDraw.Draw(image, "RGBA")

        # Draw OCR rectangles as yellow filled rect
        for rect in ocr_rects:
            x0, y0, x1, y1 = rect.as_tuple()
            x0 *= scale_x
            x1 *= scale_x
            # Both y coordinates use the vertical scale (y0 previously used
            # scale_x, which skewed boxes when the two scales differ).
            y0 *= scale_y
            y1 *= scale_y

            shade_color = (255, 255, 0, 40)  # transparent yellow
            draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)

        # Draw OCR and programmatic cells
        for tc in page.cells:
            x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
            x0 *= scale_x
            x1 *= scale_x
            y0 *= scale_y
            y1 *= scale_y

            # Keep the smaller y first for drawing
            if y1 <= y0:
                y1, y0 = y0, y1

            color = "magenta" if tc.from_ocr else "gray"

            draw.rectangle([(x0, y0), (x1, y1)], outline=color)

        if show:
            image.show()
        else:
            out_path: Path = (
                Path(settings.debug.debug_output_path)
                / f"debug_{conv_res.input.file.stem}"
            )
            out_path.mkdir(parents=True, exist_ok=True)

            out_file = out_path / f"ocr_page_{page.page_no:05}.png"
            image.save(str(out_file), format="png")

    @abstractmethod
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Run OCR on ``page_batch``; implemented by subclasses."""
        pass

    @classmethod
    @abstractmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the ``OcrOptions`` subclass this model accepts."""
        pass
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Iterable, Sequence
5
+ from typing import Type
6
+
7
+ from docling.datamodel.base_models import Page, TableStructurePrediction
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import BaseTableStructureOptions
10
+ from docling.models.base_model import BaseModelWithOptions, BasePageModel
11
+
12
+
13
class BaseTableStructureModel(BasePageModel, BaseModelWithOptions, ABC):
    """Common interface implemented by table structure models."""

    enabled: bool

    @classmethod
    @abstractmethod
    def get_options_type(cls) -> Type[BaseTableStructureOptions]:
        """Return the options class this table model is configured with."""

    @abstractmethod
    def predict_tables(
        self,
        conv_res: ConversionResult,
        pages: Sequence[Page],
    ) -> Sequence[TableStructurePrediction]:
        """Return table structure predictions for ``pages``."""

    def __call__(
        self,
        conv_res: ConversionResult,
        page_batch: Iterable[Page],
    ) -> Iterable[Page]:
        """Attach table structure predictions to each page and yield the pages.

        When the instance has ``enabled`` set to a false value, the pages are
        passed through untouched. Otherwise the whole batch is materialized,
        predicted in one ``predict_tables`` call, and each page is yielded
        after its prediction has been stored. Pages and predictions are paired
        with ``zip``, so iteration ends at the shorter of the two.
        """
        is_enabled = getattr(self, "enabled", True)
        if is_enabled:
            batch = list(page_batch)
            table_predictions = self.predict_tables(conv_res, batch)
            for page, table_prediction in zip(batch, table_predictions):
                page.predictions.tablestructure = table_prediction
                yield page
        else:
            yield from page_batch
File without changes