docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from typing import Any, Generic, Optional, Protocol, Type, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from docling_core.types.doc import (
|
|
8
|
+
BoundingBox,
|
|
9
|
+
DocItem,
|
|
10
|
+
DoclingDocument,
|
|
11
|
+
NodeItem,
|
|
12
|
+
PictureItem,
|
|
13
|
+
)
|
|
14
|
+
from PIL.Image import Image
|
|
15
|
+
from typing_extensions import TypeVar
|
|
16
|
+
|
|
17
|
+
from docling.datamodel.base_models import (
|
|
18
|
+
ItemAndImageEnrichmentElement,
|
|
19
|
+
Page,
|
|
20
|
+
VlmPrediction,
|
|
21
|
+
)
|
|
22
|
+
from docling.datamodel.document import ConversionResult
|
|
23
|
+
from docling.datamodel.pipeline_options import BaseOptions
|
|
24
|
+
from docling.datamodel.pipeline_options_vlm_model import (
|
|
25
|
+
InlineVlmOptions,
|
|
26
|
+
TransformersPromptStyle,
|
|
27
|
+
)
|
|
28
|
+
from docling.datamodel.settings import settings
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class BaseModelWithOptions(Protocol):
    """Structural interface for models that are configured through an options object.

    Implementers expose the options class they accept via ``get_options_type``
    and take an instance of it as the keyword-only ``options`` argument of
    their constructor.
    """

    @classmethod
    def get_options_type(cls) -> Type[BaseOptions]:
        """Return the ``BaseOptions`` subclass this model is configured with."""

    def __init__(self, *, options: BaseOptions, **kwargs):
        """Build the model from ``options``; further keyword arguments are model-specific."""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class BasePageModel(ABC):
    """Abstract base for models that are applied to a batch of pages."""

    @abstractmethod
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Process ``page_batch`` in the context of ``conv_res`` and return the pages.

        Args:
            conv_res: The conversion result the pages belong to.
            page_batch: The pages to process.

        Returns:
            An iterable of pages.
        """
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class BaseVlmModel(ABC):
    """Abstract base for Vision-Language Models that can process raw images."""

    @abstractmethod
    def process_images(
        self,
        image_batch: Iterable[Union[Image, np.ndarray]],
        prompt: Union[str, list[str]],
    ) -> Iterable[VlmPrediction]:
        """Run the model on raw images, without any page metadata.

        Args:
            image_batch: The images to process, given as PIL Images or numpy
                arrays.
            prompt: The prompt(s) to use:
                - str: one prompt shared by all images
                - list[str]: one prompt per image; its length must equal the
                  number of images

        Returns:
            An iterable of ``VlmPrediction`` objects.

        Raises:
            ValueError: If a prompt list is given whose length differs from the
                number of images.
        """
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class BaseVlmPageModel(BasePageModel, BaseVlmModel):
    """Common base for VLM models that also act as page models.

    Combines the page-model interface (``__call__``) with the raw-image
    interface (``process_images``) and offers shared prompt helpers.
    """

    # Subclasses are expected to assign both attributes during initialization.
    vlm_options: InlineVlmOptions
    processor: Any

    def _build_prompt_safe(self, page: Page) -> str:
        """Build the prompt for ``page``, tolerating older ``build_prompt`` overrides.

        ``build_prompt`` is first called with the extra ``_internal_page``
        keyword (used by layout-aware pipelines). If that call raises a
        ``TypeError`` - e.g. because a user-supplied override does not accept
        the keyword - it is retried with the parsed page only.

        Args:
            page: The full Page object with layout predictions and parsed_page.

        Returns:
            The formatted prompt string.
        """
        parsed_page = page.parsed_page
        try:
            prompt = self.vlm_options.build_prompt(parsed_page, _internal_page=page)
        except TypeError:
            # The override does not know `_internal_page`; use the basic call.
            prompt = self.vlm_options.build_prompt(parsed_page)
        return prompt

    @abstractmethod
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Extract images from pages, process them, and attach results back."""

    def formulate_prompt(self, user_prompt: str) -> str:
        """Wrap ``user_prompt`` according to the configured prompt style.

        The checks run in this order: RAW returns the prompt unchanged, NONE
        returns an empty string, the Phi-4 multimodal repo gets its dedicated
        token template, and CHAT delegates to the processor's chat template.

        Args:
            user_prompt: The plain prompt text.

        Returns:
            The prompt string to feed to the model.

        Raises:
            RuntimeError: If the configured prompt style matches none of the
                cases above.
        """
        logger = logging.getLogger(__name__)
        options = self.vlm_options
        style = options.transformers_prompt_style

        if style == TransformersPromptStyle.RAW:
            return user_prompt

        if style == TransformersPromptStyle.NONE:
            return ""

        if options.repo_id == "microsoft/Phi-4-multimodal-instruct":
            logger.debug("Using specialized prompt for Phi-4")
            # Note: This might need adjustment for VLLM vs transformers
            user_prompt_prefix = "<|user|>"
            assistant_prompt = "<|assistant|>"
            prompt_suffix = "<|end|>"

            prompt = f"{user_prompt_prefix}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
            logger.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
            return prompt

        if style == TransformersPromptStyle.CHAT:
            intro_part = {
                "type": "text",
                "text": "This is a page from a document.",
            }
            image_part = {"type": "image"}
            request_part = {"type": "text", "text": user_prompt}
            messages = [
                {
                    "role": "user",
                    "content": [intro_part, image_part, request_part],
                }
            ]
            return self.processor.apply_chat_template(
                messages, add_generation_prompt=True
            )

        raise RuntimeError(
            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
        )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Type of the prepared element an enrichment model works on. It falls back to
# NodeItem when GenericEnrichmentModel is used without a type argument; the
# `default=` keyword is why TypeVar comes from typing_extensions here.
EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
    """Abstract base for enrichment models working on document elements.

    ``EnrichElementT`` is the type produced by ``prepare_element`` and consumed
    in batches by ``__call__``.
    """

    # Batch size, read from the performance settings when the class is defined.
    elements_batch_size: int = settings.perf.elements_batch_size

    @abstractmethod
    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Tell whether this model should handle ``element`` of ``doc``."""

    @abstractmethod
    def prepare_element(
        self, conv_res: ConversionResult, element: NodeItem
    ) -> Optional[EnrichElementT]:
        """Turn ``element`` into the model's input type, or return ``None``."""

    @abstractmethod
    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
    ) -> Iterable[NodeItem]:
        """Process a batch of prepared elements and return the node items."""
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
    """Enrichment model whose prepared element is the node item itself."""

    def prepare_element(
        self, conv_res: ConversionResult, element: NodeItem
    ) -> Optional[NodeItem]:
        """Return ``element`` unchanged if it is processable, otherwise ``None``."""
        processable = self.is_processable(doc=conv_res.document, element=element)
        return element if processable else None
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class BaseItemAndImageEnrichmentModel(
    GenericEnrichmentModel[ItemAndImageEnrichmentElement]
):
    """Enrichment model whose prepared element pairs a document item with its image.

    The image is either the picture embedded in the document or a crop of the
    page image around the item's first provenance bounding box.
    """

    # Scale passed to the page's get_image() when cropping. No default is given
    # here - presumably set by subclasses; verify against implementers.
    images_scale: float
    # Fraction of the bounding-box width/height added on every side of the crop.
    expansion_factor: float = 0.0

    def prepare_element(
        self, conv_res: ConversionResult, element: NodeItem
    ) -> Optional[ItemAndImageEnrichmentElement]:
        """Pair ``element`` with the image this model should look at.

        Args:
            conv_res: Conversion result providing the document and its pages.
            element: The candidate element.

        Returns:
            The element together with its image, or ``None`` when the element
            is not processable, has no provenance to crop from, or (for
            pictures) no image could be obtained at all.
        """
        if not self.is_processable(doc=conv_res.document, element=element):
            return None

        assert isinstance(element, DocItem)

        # Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
        if isinstance(element, PictureItem):
            embedded_im = element.get_image(conv_res.document)
            if embedded_im is not None:
                return ItemAndImageEnrichmentElement(item=element, image=embedded_im)

        # Without provenance there is no page region to crop. Guarding here for
        # every item type keeps `element.prov[0]` below from raising IndexError.
        if len(element.prov) == 0:
            return None

        # Crop the image from the page
        element_prov = element.prov[0]
        bbox = element_prov.bbox
        width = bbox.r - bbox.l
        height = bbox.t - bbox.b

        # TODO: move to a utility in the BoundingBox class
        expanded_bbox = BoundingBox(
            l=bbox.l - width * self.expansion_factor,
            t=bbox.t + height * self.expansion_factor,
            r=bbox.r + width * self.expansion_factor,
            b=bbox.b - height * self.expansion_factor,
            coord_origin=bbox.coord_origin,
        )

        # Index into conv_res.pages relative to the first page's number.
        # NOTE(review): this assumes conv_res.pages holds consecutive page
        # numbers - confirm against the pipeline that fills it.
        page_ix = element_prov.page_no - conv_res.pages[0].page_no
        cropped_image = conv_res.pages[page_ix].get_image(
            scale=self.images_scale, cropbox=expanded_bbox
        )

        # Allow for images being embedded without the page backend or page images
        if cropped_image is None and isinstance(element, PictureItem):
            embedded_im = element.get_image(conv_res.document)
            if embedded_im is not None:
                return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
            else:
                return None

        # Return the proper cropped image
        return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import logging
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Type
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
10
|
+
from docling_core.types.doc.page import TextCell
|
|
11
|
+
from PIL import Image, ImageDraw
|
|
12
|
+
from rtree import index
|
|
13
|
+
|
|
14
|
+
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
15
|
+
from docling.datamodel.base_models import Page
|
|
16
|
+
from docling.datamodel.document import ConversionResult
|
|
17
|
+
from docling.datamodel.pipeline_options import OcrOptions
|
|
18
|
+
from docling.datamodel.settings import settings
|
|
19
|
+
from docling.models.base_model import BaseModelWithOptions, BasePageModel
|
|
20
|
+
|
|
21
|
+
# Module-level logger named after this module.
_log = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class BaseOcrModel(BasePageModel, BaseModelWithOptions):
    """Shared base for OCR page models.

    Provides the helpers concrete OCR engines have in common: choosing the
    page regions to OCR, merging OCR cells with programmatic cells, writing
    the result back to the page, and rendering a debug overlay.
    """

    def __init__(
        self,
        *,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: OcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Store the common OCR configuration.

        Args:
            enabled: Whether this OCR model is active.
            artifacts_path: Not used by this base class.
            options: The OCR options; stored as ``self.options``.
            accelerator_options: Not used by this base class.
        """
        # Make sure any delay/error from import occurs on ocr model init and not first use
        from scipy.ndimage import binary_dilation, find_objects, label

        self.enabled = enabled
        self.options = options

    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
        """Return the page regions that should be sent to OCR.

        The bitmap rectangles reported by the page backend are rasterized,
        dilated so that nearby rectangles merge, and grouped into connected
        components. Depending on the covered page fraction the result is:

        - one full-page box, if ``force_full_page_ocr`` is set or the coverage
          exceeds both 0.75 and ``bitmap_area_threshold``;
        - the merged component boxes, if the coverage exceeds
          ``bitmap_area_threshold``;
        - an empty list otherwise.

        Args:
            page: The page to analyse; ``page.size`` must be set.

        Returns:
            Bounding boxes with a top-left coordinate origin.
        """
        from scipy.ndimage import binary_dilation, find_objects, label

        BITMAP_COVERAGE_THRESHOLD = 0.75
        assert page.size is not None

        def find_ocr_rects(size, bitmap_rects):
            """Return (covered area fraction, merged bounding boxes)."""
            image = Image.new(
                "1", (round(size.width), round(size.height))
            )  # '1' mode is binary

            # Draw all bitmap rects into a binary image
            draw = ImageDraw.Draw(image)
            for rect in bitmap_rects:
                x0, y0, x1, y1 = rect.as_tuple()
                x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
                draw.rectangle([(x0, y0), (x1, y1)], fill=1)

            np_image = np.array(image)

            # Dilate the image to merge nearby bitmap rectangles
            structure = np.ones(
                (20, 20)
            )  # 20x20 structuring element (roughly 10 pixels in all directions)
            np_image = binary_dilation(np_image > 0, structure=structure)

            # Find the connected components
            labeled_image, num_features = label(
                np_image > 0
            )  # Label the non-zero (bitmap-covered) regions

            # Find enclosing bounding boxes for each connected component.
            slices = find_objects(labeled_image)
            bounding_boxes = [
                BoundingBox(
                    l=slc[1].start,
                    t=slc[0].start,
                    r=slc[1].stop - 1,
                    b=slc[0].stop - 1,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
                for slc in slices
            ]

            # Compute area fraction on page covered by bitmaps
            # (measured on the dilated mask)
            area_frac = np.sum(np_image > 0) / (size.width * size.height)

            return (area_frac, bounding_boxes)  # fraction covered  # boxes

        if page._backend is not None:
            bitmap_rects = page._backend.get_bitmap_rects()
        else:
            bitmap_rects = []
        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

        # return full-page rectangle if page is dominantly covered with bitmaps
        if self.options.force_full_page_ocr or coverage > max(
            BITMAP_COVERAGE_THRESHOLD, self.options.bitmap_area_threshold
        ):
            return [
                BoundingBox(
                    l=0,
                    t=0,
                    r=page.size.width,
                    b=page.size.height,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
            ]
        # return individual rectangles if the bitmap coverage is above the threshold
        elif coverage > self.options.bitmap_area_threshold:
            return ocr_rects
        else:  # overall coverage of bitmaps is too low, drop all bitmap rectangles.
            return []

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
    def _filter_ocr_cells(
        self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
    ) -> List[TextCell]:
        """Return the OCR cells that intersect no programmatic cell.

        Args:
            ocr_cells: Cells produced by OCR.
            programmatic_cells: Cells already present on the page.

        Returns:
            The OCR cells whose bounding box hits nothing in an R-tree built
            from the programmatic cells' bounding boxes.
        """
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
        idx = index.Index(properties=p)
        for i, cell in enumerate(programmatic_cells):
            idx.insert(i, cell.rect.to_bounding_box().as_tuple())

        def is_overlapping_with_existing_cells(ocr_cell):
            # Query the R-tree to get overlapping rectangles
            possible_matches_index = list(
                idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
            )

            return (
                len(possible_matches_index) > 0
            )  # this is a weak criterion but it works.

        filtered_ocr_cells = [
            rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
        ]
        return filtered_ocr_cells

    def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
        r"""
        Post-process the OCR cells and update the page object.
        Updates parsed_page.textline_cells directly since page.cells is now read-only.
        """
        # Get existing cells from the read-only property
        existing_cells = page.cells

        # Combine existing and OCR cells with overlap filtering
        final_cells = self._combine_cells(existing_cells, ocr_cells)

        assert page.parsed_page is not None

        # Update parsed_page.textline_cells directly
        page.parsed_page.textline_cells = final_cells
        page.parsed_page.has_lines = len(final_cells) > 0

        # When force_full_page_ocr is used, PDF-extracted word/char cells are
        # unreliable. Filter out cells where from_ocr=False, keeping any OCR-
        # generated cells. This ensures downstream components (e.g., table
        # structure model) fall back to OCR-extracted textline cells.
        if self.options.force_full_page_ocr:
            page.parsed_page.word_cells = [
                c for c in page.parsed_page.word_cells if c.from_ocr
            ]
            page.parsed_page.char_cells = [
                c for c in page.parsed_page.char_cells if c.from_ocr
            ]
            page.parsed_page.has_words = len(page.parsed_page.word_cells) > 0
            page.parsed_page.has_chars = len(page.parsed_page.char_cells) > 0

    def _combine_cells(
        self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
    ) -> List[TextCell]:
        """Combine existing and OCR cells with filtering and re-indexing.

        With ``force_full_page_ocr`` only the OCR cells are kept; otherwise the
        existing cells come first, followed by the OCR cells that do not
        intersect any of them. The ``index`` of every returned cell is
        overwritten with its position in the combined list.
        """
        if self.options.force_full_page_ocr:
            combined = ocr_cells
        else:
            filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
            combined = list(existing_cells) + filtered_ocr_cells

        # Re-index in-place
        for i, cell in enumerate(combined):
            cell.index = i

        return combined

    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        """Render a debug overlay of OCR regions and text cells on the page image.

        OCR rectangles are shaded in transparent yellow; cells are outlined in
        magenta when they come from OCR and in gray otherwise.

        Args:
            conv_res: Conversion result; its input file stem names the debug
                output folder.
            page: The page whose image, size and cells are drawn.
            ocr_rects: The OCR rectangles to shade.
            show: If true, display the image; otherwise save it as a PNG under
                the configured debug output path.
        """
        image = copy.deepcopy(page.image)
        scale_x = image.width / page.size.width
        scale_y = image.height / page.size.height

        draw = ImageDraw.Draw(image, "RGBA")

        # Draw OCR rectangles as yellow filled rect
        for rect in ocr_rects:
            x0, y0, x1, y1 = rect.as_tuple()
            # Both y coordinates use the vertical scale (y0 was previously
            # scaled with scale_x, skewing boxes when the scales differ).
            y0 *= scale_y
            y1 *= scale_y
            x0 *= scale_x
            x1 *= scale_x

            shade_color = (255, 255, 0, 40)  # transparent yellow
            draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)

        # Draw OCR and programmatic cells
        for tc in page.cells:
            x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
            y0 *= scale_y
            y1 *= scale_y
            x0 *= scale_x
            x1 *= scale_x

            # Order the y coordinates so that y0 is the smaller one before drawing.
            if y1 <= y0:
                y1, y0 = y0, y1

            color = "magenta" if tc.from_ocr else "gray"

            draw.rectangle([(x0, y0), (x1, y1)], outline=color)

        if show:
            image.show()
        else:
            out_path: Path = (
                Path(settings.debug.debug_output_path)
                / f"debug_{conv_res.input.file.stem}"
            )
            out_path.mkdir(parents=True, exist_ok=True)

            out_file = out_path / f"ocr_page_{page.page_no:05}.png"
            image.save(str(out_file), format="png")

    @abstractmethod
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Run OCR on ``page_batch`` and return the pages."""
        pass

    @classmethod
    @abstractmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the ``OcrOptions`` subclass this OCR model is configured with."""
        pass
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import Iterable, Sequence
|
|
5
|
+
from typing import Type
|
|
6
|
+
|
|
7
|
+
from docling.datamodel.base_models import Page, TableStructurePrediction
|
|
8
|
+
from docling.datamodel.document import ConversionResult
|
|
9
|
+
from docling.datamodel.pipeline_options import BaseTableStructureOptions
|
|
10
|
+
from docling.models.base_model import BaseModelWithOptions, BasePageModel
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BaseTableStructureModel(BasePageModel, BaseModelWithOptions, ABC):
    """Common interface shared by all table structure models."""

    # Read via getattr() in __call__; a missing attribute counts as enabled.
    enabled: bool

    @classmethod
    @abstractmethod
    def get_options_type(cls) -> Type[BaseTableStructureOptions]:
        """Return the options class this table model is configured with."""

    @abstractmethod
    def predict_tables(
        self,
        conv_res: ConversionResult,
        pages: Sequence[Page],
    ) -> Sequence[TableStructurePrediction]:
        """Return table structure predictions for ``pages``.

        ``__call__`` pairs the returned predictions with ``pages`` by position.
        """

    def __call__(
        self,
        conv_res: ConversionResult,
        page_batch: Iterable[Page],
    ) -> Iterable[Page]:
        """Attach table structure predictions to each page and yield the pages.

        When the model is disabled the pages are passed through untouched.
        Otherwise the whole batch is materialized, predicted in one
        ``predict_tables`` call, and each page is yielded after its prediction
        has been stored in ``page.predictions.tablestructure``.
        """
        if getattr(self, "enabled", True):
            batch = list(page_batch)
            results = self.predict_tables(conv_res, batch)

            for current_page, table_prediction in zip(batch, results):
                current_page.predictions.tablestructure = table_prediction
                yield current_page
        else:
            yield from page_batch
|
|
File without changes
|