docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,60 @@
1
def ocr_engines():
    """Return the bundled OCR model classes under the ``"ocr_engines"`` key.

    The model modules are imported inside the function body, so they are only
    loaded when this function is actually called.

    Returns
    -------
    dict
        A single-entry mapping ``{"ocr_engines": [<model classes>]}``.
    """
    from docling.models.stages.ocr.auto_ocr_model import OcrAutoModel
    from docling.models.stages.ocr.easyocr_model import EasyOcrModel
    from docling.models.stages.ocr.ocr_mac_model import OcrMacModel
    from docling.models.stages.ocr.rapid_ocr_model import RapidOcrModel
    from docling.models.stages.ocr.tesseract_ocr_cli_model import TesseractOcrCliModel
    from docling.models.stages.ocr.tesseract_ocr_model import TesseractOcrModel

    # List order is preserved exactly as declared by the package.
    engine_classes = [
        OcrAutoModel,
        EasyOcrModel,
        OcrMacModel,
        RapidOcrModel,
        TesseractOcrModel,
        TesseractOcrCliModel,
    ]
    return {"ocr_engines": engine_classes}
19
+
20
+
21
def picture_description():
    """Return the bundled picture-description model classes.

    The model modules are imported inside the function body, so they are only
    loaded when this function is actually called.

    Returns
    -------
    dict
        A single-entry mapping ``{"picture_description": [<model classes>]}``.
    """
    from docling.models.stages.picture_description.picture_description_api_model import (
        PictureDescriptionApiModel,
    )
    from docling.models.stages.picture_description.picture_description_vlm_model import (
        PictureDescriptionVlmModel,
    )

    # The VLM-backed model is listed before the API-backed one.
    description_classes = [PictureDescriptionVlmModel, PictureDescriptionApiModel]
    return {"picture_description": description_classes}
35
+
36
+
37
def layout_engines():
    """Return the bundled layout model classes under ``"layout_engines"``.

    The model modules are imported inside the function body, so they are only
    loaded when this function is actually called.

    Returns
    -------
    dict
        A single-entry mapping ``{"layout_engines": [<model classes>]}``.
    """
    from docling.experimental.models.table_crops_layout_model import (
        TableCropsLayoutModel,
    )
    from docling.models.stages.layout.layout_model import LayoutModel

    # The standard layout model comes first, the experimental one second.
    layout_classes = [LayoutModel, TableCropsLayoutModel]
    return {"layout_engines": layout_classes}
49
+
50
+
51
def table_structure_engines():
    """Return the bundled table-structure model classes.

    The model module is imported inside the function body, so it is only
    loaded when this function is actually called.

    Returns
    -------
    dict
        A single-entry mapping
        ``{"table_structure_engines": [TableStructureModel]}``.
    """
    from docling.models.stages.table_structure.table_structure_model import (
        TableStructureModel,
    )

    table_classes = [TableStructureModel]
    return {"table_structure_engines": table_classes}
File without changes
File without changes
@@ -0,0 +1,342 @@
1
+ import re
2
+ from collections.abc import Iterable
3
+ from pathlib import Path
4
+ from typing import List, Literal, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ from docling_core.types.doc import (
8
+ CodeItem,
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ NodeItem,
12
+ TextItem,
13
+ )
14
+ from docling_core.types.doc.labels import CodeLanguageLabel
15
+ from PIL import Image
16
+ from pydantic import BaseModel
17
+
18
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
19
+ from docling.datamodel.base_models import ItemAndImageEnrichmentElement
20
+ from docling.models.base_model import BaseItemAndImageEnrichmentModel
21
+ from docling.models.utils.hf_model_download import download_hf_model
22
+ from docling.utils.accelerator_utils import decide_device
23
+
24
+
25
class CodeFormulaModelOptions(BaseModel):
    """
    Configuration options for the CodeFormulaModel.

    Attributes
    ----------
    kind : Literal["code_formula"]
        Type of the model. Fixed value "code_formula".
    do_code_enrichment : bool
        True if code enrichment is enabled, False otherwise. Defaults to True.
    do_formula_enrichment : bool
        True if formula enrichment is enabled, False otherwise. Defaults to True.
    """

    # Only the literal "code_formula" is accepted for this field.
    kind: Literal["code_formula"] = "code_formula"
    # Read by CodeFormulaModel.is_processable to gate CodeItem elements.
    do_code_enrichment: bool = True
    # Read by CodeFormulaModel.is_processable to gate FORMULA-labelled TextItems.
    do_formula_enrichment: bool = True
42
+
43
+
44
class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
    """
    Model for processing and enriching documents with code and formula predictions.

    Attributes
    ----------
    enabled : bool
        True if the model is enabled, False otherwise.
    options : CodeFormulaModelOptions
        Configuration options for the CodeFormulaModel.
    device
        Device returned by ``decide_device``. Only set when the model is enabled.
    _processor
        Processor loaded with ``AutoProcessor.from_pretrained``. Only set when
        the model is enabled.
    _model
        Model loaded with ``AutoModelForImageTextToText.from_pretrained`` and
        switched to eval mode. Only set when the model is enabled.
    _model_max_length
        ``model_max_length`` of the processor's tokenizer; used to derive the
        generation budget. Only set when the model is enabled.

    Methods
    -------
    __init__(self, enabled, artifacts_path, options, accelerator_options)
        Initializes the CodeFormulaModel with the given configuration options.
    download_models(local_dir, force, progress)
        Downloads the model repository and returns its local path.
    is_processable(self, doc, element)
        Determines if a given element in a document can be processed by the model.
    __call__(self, doc, element_batch)
        Processes the given batch of elements and enriches them with predictions.
    """

    _model_repo_folder = "docling-project--CodeFormulaV2"
    elements_batch_size = 5
    images_scale = 1.67  # = 120 dpi, aligned with training data resolution
    expansion_factor = 0.18

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: CodeFormulaModelOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Initializes the CodeFormulaModel with the given configuration.

        Parameters
        ----------
        enabled : bool
            True if the model is enabled, False otherwise.
        artifacts_path : Optional[Path]
            Parent directory that contains the ``_model_repo_folder`` sub-folder
            with the model artifacts. If None, the model is fetched through
            ``download_models``.
        options : CodeFormulaModelOptions
            Configuration options for the model.
        accelerator_options : AcceleratorOptions
            Options specifying the device and number of threads for acceleration.
        """
        self.enabled = enabled
        self.options = options

        # Nothing is loaded for a disabled model; the attributes below are
        # therefore only available when ``enabled`` is True.
        if self.enabled:
            self.device = decide_device(
                accelerator_options.device,
                supported_devices=[
                    AcceleratorDevice.CPU,
                    AcceleratorDevice.CUDA,
                    AcceleratorDevice.XPU,
                ],
            )

            if artifacts_path is None:
                artifacts_path = self.download_models()
            else:
                artifacts_path = artifacts_path / self._model_repo_folder

            # Imported lazily so that transformers is only required when the
            # model is actually enabled.
            from transformers import AutoModelForImageTextToText, AutoProcessor

            self._processor = AutoProcessor.from_pretrained(
                artifacts_path,
            )
            self._model_max_length = self._processor.tokenizer.model_max_length
            self._model = AutoModelForImageTextToText.from_pretrained(
                artifacts_path, device_map=self.device
            )
            self._model.eval()

    @staticmethod
    def download_models(
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        """
        Downloads the ``docling-project/CodeFormulaV2`` repository (revision ``main``).

        Parameters
        ----------
        local_dir : Optional[Path]
            Target directory, forwarded to ``download_hf_model``.
        force : bool
            Forwarded to ``download_hf_model``.
        progress : bool
            Forwarded to ``download_hf_model``.

        Returns
        -------
        Path
            The path returned by ``download_hf_model``.
        """
        return download_hf_model(
            repo_id="docling-project/CodeFormulaV2",
            revision="main",
            local_dir=local_dir,
            force=force,
            progress=progress,
        )

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """
        Determines if a given element in a document can be processed by the model.

        Parameters
        ----------
        doc : DoclingDocument
            The document being processed.
        element : NodeItem
            The element within the document to check.

        Returns
        -------
        bool
            True if the model is enabled and the element is either a code item
            (with code enrichment on) or a formula-labelled text item (with
            formula enrichment on), False otherwise.
        """
        return self.enabled and (
            (isinstance(element, CodeItem) and self.options.do_code_enrichment)
            or (
                isinstance(element, TextItem)
                and element.label == DocItemLabel.FORMULA
                and self.options.do_formula_enrichment
            )
        )

    def _extract_code_language(self, input_string: str) -> Tuple[str, Optional[str]]:
        """Extracts a programming language from the beginning of a string.

        This function checks if the input string starts with a pattern of the form
        ``<_some_language_>``. If it does, it extracts the language string and returns
        a tuple of (remainder, language). Otherwise, it returns the original string
        and `None`.

        Args:
            input_string (str): The input string, which may start with ``<_language_>``.

        Returns:
            Tuple[str, Optional[str]]:
                A tuple where:
                - The first element is either:
                    - The remainder of the string (everything after ``<_language_>``
                      and any whitespace that follows it), if a match is found; or
                    - The original string, if no match is found.
                - The second element is the extracted language if a match is found;
                  otherwise, `None`.
        """
        pattern = r"^<_([^_>]+)_>\s*(.*)"
        # DOTALL lets the remainder span multiple lines of code.
        match = re.match(pattern, input_string, flags=re.DOTALL)
        if match:
            language = str(match.group(1))  # the captured programming language
            remainder = str(match.group(2))  # everything after the <_language_>
            return remainder, language
        else:
            return input_string, None

    def _get_code_language_enum(self, value: Optional[str]) -> CodeLanguageLabel:
        """
        Converts a string to a corresponding `CodeLanguageLabel` enum member.

        If the provided string does not match any value in `CodeLanguageLabel`,
        it defaults to `CodeLanguageLabel.UNKNOWN`.

        Args:
            value (Optional[str]): The string representation of the code language or None.

        Returns:
            CodeLanguageLabel: The corresponding enum member if the value is valid,
            otherwise `CodeLanguageLabel.UNKNOWN`.
        """
        if not isinstance(value, str):
            return CodeLanguageLabel.UNKNOWN

        try:
            return CodeLanguageLabel(value)
        except ValueError:
            return CodeLanguageLabel.UNKNOWN

    def _get_prompt(self, label: str) -> str:
        """
        Constructs the prompt for the model based on the input label.

        Parameters
        ----------
        label : str
            The type of input, either 'code' or 'formula'.

        Returns
        -------
        str
            The constructed prompt including necessary tokens and query.

        Raises
        ------
        NotImplementedError
            If the label is not 'code' or 'formula'.
        """
        if label == "code":
            query = "<code>"
        elif label == "formula":
            query = "<formula>"
        else:
            raise NotImplementedError("Label must be either code or formula")

        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": query}],
            },
        ]

        prompt = self._processor.apply_chat_template(
            messages, add_generation_prompt=True
        )

        return prompt

    def _post_process(self, texts: list[str]) -> list[str]:
        """
        Processes a list of text strings by truncating at '<end_of_utterance>' and
        removing a predefined set of unwanted substrings.

        Parameters
        ----------
        texts : list[str]
            A list of strings to be post-processed.

        Returns
        -------
        list[str]
            A list of cleaned strings with specified substrings removed, truncated at
            '<end_of_utterance>' if present, and stripped of leading whitespace.
        """
        to_remove = ["</code>", "</formula>", "<loc_0><loc_0><loc_500><loc_500>"]

        def clean_text(text: str) -> str:
            idx = text.find("<end_of_utterance>")
            if idx != -1:
                text = text[:idx]

            # str.replace is a no-op when the token is absent, so no
            # membership pre-check is needed.
            for token in to_remove:
                text = text.replace(token, "")
            return text.lstrip()

        return [clean_text(t) for t in texts]

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        """
        Processes the given batch of elements and enriches them with predictions.

        Parameters
        ----------
        doc : DoclingDocument
            The document being processed.
        element_batch : Iterable[ItemAndImageEnrichmentElement]
            A batch of elements to be processed.

        Returns
        -------
        Iterable[NodeItem]
            The items of the batch, in input order. When the model is enabled,
            each item's ``text`` is replaced by the prediction and code items
            additionally get ``code_language`` set. When the model is disabled,
            the items are yielded unchanged.
        """
        if not self.enabled:
            for element in element_batch:
                yield element.item
            return

        labels: List[str] = []
        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[TextItem] = []
        for el in element_batch:
            elements.append(el.item)  # type: ignore[arg-type]
            labels.append(el.item.label)  # type: ignore[attr-defined]
            images.append(el.image)

        # An empty batch has nothing to enrich; skip the processor and the
        # generate() call instead of feeding them empty inputs.
        if not elements:
            return

        prompts = [self._get_prompt(label) for label in labels]
        inputs = self._processor(
            text=prompts,
            images=images,
            return_tensors="pt",
        )
        inputs = inputs.to(self.device)

        # The generation budget is whatever is left of the model's maximum
        # length after the prompt tokens.
        # NOTE(review): this becomes non-positive if the prompt alone reaches
        # model_max_length -- confirm that cannot happen for supported inputs.
        gen_kwargs = dict(
            max_new_tokens=self._model_max_length - inputs.input_ids.shape[1],
            use_cache=True,
            do_sample=False,
        )

        generated_ids = self._model.generate(**inputs, **gen_kwargs)

        # Decode only the newly generated tokens (everything after the prompt).
        # Special tokens are kept because _post_process relies on them.
        outputs = self._processor.batch_decode(
            generated_ids[:, inputs.input_ids.shape[1] :], skip_special_tokens=False
        )
        outputs = self._post_process(outputs)

        for item, output in zip(elements, outputs):
            if isinstance(item, CodeItem):
                output, code_language = self._extract_code_language(output)
                item.code_language = self._get_code_language_enum(code_language)
            item.text = output

            yield item
File without changes
@@ -0,0 +1,249 @@
1
+ import copy
2
+ import logging
3
+ import warnings
4
+ from collections.abc import Sequence
5
+ from pathlib import Path
6
+ from typing import List, Optional, Union
7
+
8
+ import numpy as np
9
+ from docling_core.types.doc import DocItemLabel
10
+ from PIL import Image
11
+
12
+ from docling.datamodel.accelerator_options import AcceleratorOptions
13
+ from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
14
+ from docling.datamodel.document import ConversionResult
15
+ from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
16
+ from docling.datamodel.pipeline_options import LayoutOptions
17
+ from docling.datamodel.settings import settings
18
+ from docling.models.base_layout_model import BaseLayoutModel
19
+ from docling.models.utils.hf_model_download import download_hf_model
20
+ from docling.utils.accelerator_utils import decide_device
21
+ from docling.utils.layout_postprocessor import LayoutPostprocessor
22
+ from docling.utils.profiling import TimeRecorder
23
+ from docling.utils.visualization import draw_clusters
24
+
25
+ _log = logging.getLogger(__name__)
26
+
27
+
28
class LayoutModel(BaseLayoutModel):
    """Layout stage that runs a ``LayoutPredictor`` over page images.

    Raw predictions are converted into ``Cluster`` objects, post-processed
    with ``LayoutPostprocessor`` and stored on each page as a
    ``LayoutPrediction``.
    """

    TEXT_ELEM_LABELS = [
        DocItemLabel.TEXT,
        DocItemLabel.FOOTNOTE,
        DocItemLabel.CAPTION,
        DocItemLabel.CHECKBOX_UNSELECTED,
        DocItemLabel.CHECKBOX_SELECTED,
        DocItemLabel.SECTION_HEADER,
        DocItemLabel.PAGE_HEADER,
        DocItemLabel.PAGE_FOOTER,
        DocItemLabel.CODE,
        DocItemLabel.LIST_ITEM,
        DocItemLabel.FORMULA,
    ]
    PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]

    TABLE_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
    FIGURE_LABEL = DocItemLabel.PICTURE
    FORMULA_LABEL = DocItemLabel.FORMULA
    CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]

    def __init__(
        self,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        options: LayoutOptions,
    ):
        """Resolve the model artifacts location and build the layout predictor.

        Parameters
        ----------
        artifacts_path : Optional[Path]
            Directory holding the model artifacts. If None, the model is
            downloaded via ``download_models``.
        accelerator_options : AcceleratorOptions
            Supplies the device and the number of threads for the predictor.
        options : LayoutOptions
            Layout options; ``options.model_spec`` selects the model.
        """
        from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

        self.options = options

        device = decide_device(accelerator_options.device)
        layout_model_config = options.model_spec
        model_repo_folder = layout_model_config.model_repo_folder
        model_path = layout_model_config.model_path

        if artifacts_path is None:
            downloaded_root = self.download_models(
                layout_model_config=layout_model_config
            )
            artifacts_path = downloaded_root / model_path
        elif (artifacts_path / model_repo_folder).exists():
            # Preferred layout: <artifacts_path>/<repo folder>/<model path>
            artifacts_path = artifacts_path / model_repo_folder / model_path
        elif (artifacts_path / model_path).exists():
            # Legacy layout: <artifacts_path>/<model path>
            warnings.warn(
                "The usage of artifacts_path containing directly "
                f"{model_path} is deprecated. Please point "
                "the artifacts_path to the parent containing "
                f"the {model_repo_folder} folder.",
                DeprecationWarning,
                stacklevel=3,
            )
            artifacts_path = artifacts_path / model_path
        # If neither sub-path exists, artifacts_path is used as given.

        self.layout_predictor = LayoutPredictor(
            artifact_path=str(artifacts_path),
            device=device,
            num_threads=accelerator_options.num_threads,
        )

    @classmethod
    def get_options_type(cls) -> type[LayoutOptions]:
        """Return the options class used by this model (``LayoutOptions``)."""
        return LayoutOptions

    @staticmethod
    def download_models(
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
        layout_model_config: LayoutModelConfig = LayoutOptions().model_spec,  # use default
    ) -> Path:
        """Download the model repository described by ``layout_model_config``.

        All arguments are forwarded to ``download_hf_model``; its return value
        is returned unchanged.
        """
        return download_hf_model(
            repo_id=layout_model_config.repo_id,
            revision=layout_model_config.revision,
            local_dir=local_dir,
            force=force,
            progress=progress,
        )

    def draw_clusters_and_cells_side_by_side(
        self, conv_res, page, clusters, mode_prefix: str, show: bool = False
    ):
        """
        Render a debug image with two copies of the page placed side by side:
        - Left: clusters whose label is not FORM, KEY_VALUE_REGION or PICTURE.
        - Right: clusters whose label is FORM, KEY_VALUE_REGION or PICTURE.
        The image is shown on screen when ``show`` is True; otherwise it is
        written as a PNG below the configured debug output path.
        """
        scale_x = page.image.width / page.size.width
        scale_y = page.image.height / page.size.height

        # Labels that are drawn on the right-hand copy only
        exclude_labels = {
            DocItemLabel.FORM,
            DocItemLabel.KEY_VALUE_REGION,
            DocItemLabel.PICTURE,
        }
        left_clusters = [c for c in clusters if c.label not in exclude_labels]
        right_clusters = [c for c in clusters if c.label in exclude_labels]

        # Each side gets its own copy of the page image to draw on
        left_image = page.image.copy()
        right_image = page.image.copy()
        draw_clusters(left_image, left_clusters, scale_x, scale_y)
        draw_clusters(right_image, right_clusters, scale_x, scale_y)

        # Stitch both copies into one canvas twice as wide as a single page
        canvas_size = (left_image.width * 2, left_image.height)
        combined_image = Image.new("RGB", canvas_size)
        combined_image.paste(left_image, (0, 0))
        combined_image.paste(right_image, (left_image.width, 0))

        if show:
            combined_image.show()
            return

        out_path: Path = (
            Path(settings.debug.debug_output_path)
            / f"debug_{conv_res.input.file.stem}"
        )
        out_path.mkdir(parents=True, exist_ok=True)
        out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
        combined_image.save(str(out_file), format="png")

    def predict_layout(
        self,
        conv_res: ConversionResult,
        pages: Sequence[Page],
    ) -> Sequence[LayoutPrediction]:
        """Predict and post-process the layout of every page in ``pages``.

        Pages whose backend is not valid keep their existing layout prediction
        (or get an empty ``LayoutPrediction``). All other pages are predicted
        in a single batch. One prediction per input page is returned, in input
        order.
        """
        # Materialise once: the pages are iterated twice below
        pages = list(pages)

        # Pass 1: gather the images of all pages with a valid backend
        valid_page_images: List[Union[Image.Image, np.ndarray]] = []
        for pg in pages:
            assert pg._backend is not None
            if pg._backend.is_valid():
                assert pg.size is not None
                pg_image = pg.get_image(scale=1.0)
                assert pg_image is not None
                valid_page_images.append(pg_image)

        # Run the predictor once over the whole batch
        batch_predictions = []
        if valid_page_images:
            with TimeRecorder(conv_res, "layout"):
                batch_predictions = self.layout_predictor.predict_batch(  # type: ignore[attr-defined]
                    valid_page_images
                )

        # Pass 2: attach a prediction to every page, valid or not
        layout_predictions: list[LayoutPrediction] = []
        next_valid = 0  # index into batch_predictions, advanced per valid page
        for pg in pages:
            assert pg._backend is not None
            if not pg._backend.is_valid():
                fallback = pg.predictions.layout or LayoutPrediction()
                pg.predictions.layout = fallback
                layout_predictions.append(fallback)
                continue

            raw_items = batch_predictions[next_valid]
            next_valid += 1

            # Label normalisation is temporary, until docling-ibm-model uses
            # docling-core types
            clusters = [
                Cluster(
                    id=ix,
                    label=DocItemLabel(
                        raw["label"].lower().replace(" ", "_").replace("-", "_")
                    ),
                    confidence=raw["confidence"],
                    bbox=BoundingBox.model_validate(raw),
                    cells=[],
                )
                for ix, raw in enumerate(raw_items)
            ]

            if settings.debug.visualize_raw_layout:
                self.draw_clusters_and_cells_side_by_side(
                    conv_res, pg, clusters, mode_prefix="raw"
                )

            # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
            postprocessor = LayoutPostprocessor(pg, clusters, self.options)
            processed_clusters, processed_cells = postprocessor.postprocess()

            # The confidence lists may be empty; silence the numpy warnings
            # that np.mean emits in that case.
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    "Mean of empty slice|invalid value encountered in scalar divide",
                    RuntimeWarning,
                    "numpy",
                )

                cluster_confidences = [c.confidence for c in processed_clusters]
                conv_res.confidence.pages[pg.page_no].layout_score = float(
                    np.mean(cluster_confidences)
                )

                ocr_confidences = [
                    c.confidence for c in processed_cells if c.from_ocr
                ]
                conv_res.confidence.pages[pg.page_no].ocr_score = float(
                    np.mean(ocr_confidences)
                )

            prediction = LayoutPrediction(clusters=processed_clusters)
            pg.predictions.layout = prediction

            if settings.debug.visualize_layout:
                self.draw_clusters_and_cells_side_by_side(
                    conv_res, pg, processed_clusters, mode_prefix="postprocessed"
                )

            layout_predictions.append(prediction)

        return layout_predictions
File without changes