docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,431 @@
1
+ from pathlib import Path
2
+
3
+ from docling_core.types.doc import (
4
+ DocItemLabel,
5
+ DoclingDocument,
6
+ DocumentOrigin,
7
+ GroupLabel,
8
+ NodeItem,
9
+ ProvenanceItem,
10
+ RefItem,
11
+ RichTableCell,
12
+ TableData,
13
+ )
14
+ from docling_core.types.doc.document import ContentLayer
15
+ from docling_ibm_models.list_item_normalizer.list_marker_processor import (
16
+ ListItemMarkerProcessor,
17
+ )
18
+ from docling_ibm_models.reading_order.reading_order_rb import (
19
+ PageElement as ReadingOrderPageElement,
20
+ ReadingOrderPredictor,
21
+ )
22
+ from pydantic import BaseModel, ConfigDict
23
+
24
+ from docling.datamodel.base_models import (
25
+ BasePageElement,
26
+ Cluster,
27
+ ContainerElement,
28
+ FigureElement,
29
+ Table,
30
+ TextElement,
31
+ )
32
+ from docling.datamodel.document import ConversionResult
33
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
34
+
35
+
36
class ReadingOrderOptions(BaseModel):
    """Configuration object handed to ``ReadingOrderModel``."""

    # The field below starts with "model_", which pydantic reserves by default;
    # clearing the protected namespaces lets it be declared.
    model_config = ConfigDict(protected_namespaces=())

    # Semicolon-separated names, e.g. "language;term;reference"
    model_names: str = ""
40
+
41
+
42
class ReadingOrderModel:
    """Orders assembled page elements and emits them as a ``DoclingDocument``."""

    def __init__(self, options: ReadingOrderOptions):
        """Store ``options`` and instantiate the two helper components.

        Args:
            options: Settings for this model; kept on ``self.options``.
        """
        self.options = options
        # Predicts element order plus caption / footnote / merge relations.
        self.ro_model = ReadingOrderPredictor()
        # Applied to every list item created by this model.
        self.list_item_processor = ListItemMarkerProcessor()
47
+
48
def _assembled_to_readingorder_elements(
    self, conv_res: ConversionResult
) -> list[ReadingOrderPageElement]:
    """Wrap every assembled element in a ``ReadingOrderPageElement``.

    Each wrapper gets a running ``cid`` (its position in the assembled
    element list), a ``#/<page_no>/<cluster id>`` reference, and the cluster
    bounding box converted to bottom-left origin using the height of the
    element's own page.

    Args:
        conv_res: Conversion result providing ``pages`` and
            ``assembled.elements``.

    Returns:
        One reading-order element per assembled element, in the same order.
    """
    pages_by_no = {page.page_no: page for page in conv_res.pages}
    ro_elements: list[ReadingOrderPageElement] = []

    for index, assembled in enumerate(conv_res.assembled.elements):
        page_size = pages_by_no[assembled.page_no].size
        bl_bbox = assembled.cluster.bbox.to_bottom_left_origin(
            page_size.height  # type: ignore
        )
        ro_element = ReadingOrderPageElement(
            cid=index,
            ref=RefItem(cref=f"#/{assembled.page_no}/{assembled.cluster.id}"),
            text=assembled.text or "",
            page_no=assembled.page_no,
            page_size=page_size,
            label=assembled.label,
            l=bl_bbox.l,
            r=bl_bbox.r,
            b=bl_bbox.b,
            t=bl_bbox.t,
            coord_origin=bl_bbox.coord_origin,
        )
        ro_elements.append(ro_element)

    return ro_elements
76
+
77
def _add_child_elements(
    self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
):
    """Add every child cluster of ``element`` to ``doc`` beneath ``doc_item``.

    A child's text is the space-joined text of its non-blank cells, with
    ``\\x02`` characters replaced by ``-``. List items and section headers
    get dedicated item types; every other label becomes a plain text item.

    Args:
        element: Page element whose ``cluster.children`` are added.
        doc_item: Parent node for the new items.
        doc: Document being built; also supplies the page height.
    """
    child: Cluster
    for child in element.cluster.children:
        page_height = doc.pages[element.page_no].size.height
        child_bbox = child.bbox.to_bottom_left_origin(page_height)
        child_text = " ".join(
            cell.text.replace("\x02", "-").strip()
            for cell in child.cells
            if cell.text.strip()
        )
        child_prov = ProvenanceItem(
            page_no=element.page_no,
            charspan=(0, len(child_text)),
            bbox=child_bbox,
        )

        child_label = child.label
        if child_label == DocItemLabel.SECTION_HEADER:
            doc.add_heading(parent=doc_item, text=child_text, prov=child_prov)
        elif child_label == DocItemLabel.LIST_ITEM:
            # TODO: Infer if this is a numbered or a bullet list item
            list_item = doc.add_list_item(
                parent=doc_item, text=child_text, prov=child_prov
            )
            self.list_item_processor.process_list_item(list_item)
        else:
            doc.add_text(
                parent=doc_item, label=child_label, text=child_text, prov=child_prov
            )
105
+
106
def _create_rich_cell_group(
    self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
) -> RefItem:
    """Collect the children of ``element`` in a group under ``table_item``.

    The group is named after the number of tables in ``doc`` at call time
    and is populated through ``_add_child_elements``.

    Args:
        element: Page element whose child clusters fill the group.
        doc: Document being built.
        table_item: Table node that becomes the group's parent.

    Returns:
        Reference to the new group, for use as a rich table cell's ``ref``.
    """
    cell_group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=f"rich_cell_group_{len(doc.tables)}_0_0",
        parent=table_item,
    )
    self._add_child_elements(element, cell_group, doc)
    return cell_group.get_ref()
121
+
122
def _readingorder_elements_to_docling_doc(
    self,
    conv_res: ConversionResult,
    ro_elements: list[ReadingOrderPageElement],
    el_to_captions_mapping: dict[int, list[int]],
    el_to_footnotes_mapping: dict[int, list[int]],
    el_merges_mapping: dict[int, list[int]],
) -> DoclingDocument:
    """Build the output ``DoclingDocument`` from reading-ordered elements.

    Elements are emitted in the order of ``ro_elements``. Any element listed
    as a caption, footnote or merge target of another element is skipped in
    the main pass and emitted together with its owner instead.

    Args:
        conv_res: Conversion result providing the input metadata, the pages
            and the assembled elements.
        ro_elements: Reading-order elements, already sorted.
        el_to_captions_mapping: Owner ``cid`` -> ``cid`` list of its captions.
        el_to_footnotes_mapping: Owner ``cid`` -> ``cid`` list of its footnotes.
        el_merges_mapping: Text element ``cid`` -> ``cid`` list of the
            elements whose text is appended to it.

    Returns:
        The populated document.

    Raises:
        AssertionError: If a page has no size.
    """
    id_to_elem = {
        RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
        for elem in conv_res.assembled.elements
    }
    cid_to_rels = {rel.cid: rel for rel in ro_elements}

    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=conv_res.input.file.name,
        binary_hash=conv_res.input.document_hash,
    )
    doc_name = Path(origin.filename).stem
    out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    for page in conv_res.pages:
        size = page.size
        assert size is not None, "Page size is not initialized."
        out_doc.add_page(page_no=page.page_no, size=size)

    page_no_to_pages = {p.page_no: p for p in conv_res.pages}

    # cids that are emitted as part of another element, never on their own.
    skippable_cids = {
        cid
        for mapping in (
            el_to_captions_mapping,
            el_to_footnotes_mapping,
            el_merges_mapping,
        )
        for lst in mapping.values()
        for cid in lst
    }

    def page_height_of(elem):
        """Return the height of the page that ``elem`` sits on."""
        return page_no_to_pages[elem.page_no].size.height  # type: ignore

    def elem_for_cid(cid):
        """Resolve a reading-order cid to its assembled element."""
        return id_to_elem[cid_to_rels[cid].ref.cref]

    def attach_captions_and_footnotes(cid, item):
        """Add the captions, then the footnotes, mapped to ``cid`` onto ``item``."""
        # A caption or footnote carries its own page_no, so its bbox is
        # converted with the height of its own page rather than the owner's.
        for caption_cid in el_to_captions_mapping.get(cid, []):
            caption_elem = elem_for_cid(caption_cid)
            new_cap_item = self._add_caption_or_footnote(
                caption_elem, out_doc, item, page_height_of(caption_elem)
            )
            item.captions.append(new_cap_item.get_ref())

        for footnote_cid in el_to_footnotes_mapping.get(cid, []):
            footnote_elem = elem_for_cid(footnote_cid)
            new_footnote_item = self._add_caption_or_footnote(
                footnote_elem, out_doc, item, page_height_of(footnote_elem)
            )
            item.footnotes.append(new_footnote_item.get_ref())

    current_list = None

    for rel in ro_elements:
        if rel.cid in skippable_cids:
            continue
        element = id_to_elem[rel.ref.cref]
        page_height = page_height_of(element)

        if isinstance(element, TextElement):
            if element.label == DocItemLabel.CODE:
                code_text = element.text
                prov = ProvenanceItem(
                    page_no=element.page_no,
                    charspan=(0, len(code_text)),
                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                )
                code_item = out_doc.add_code(text=code_text, prov=prov)
                attach_captions_and_footnotes(rel.cid, code_item)
            else:
                new_item, current_list = self._handle_text_element(
                    element, out_doc, current_list, page_height
                )

                for merged_cid in el_merges_mapping.get(rel.cid, []):
                    merged_elem = elem_for_cid(merged_cid)
                    # The merged part carries its own page_no; use that
                    # page's height for its bbox.
                    self._merge_elements(
                        element,
                        merged_elem,
                        new_item,
                        page_height_of(merged_elem),
                    )

        elif isinstance(element, Table):
            has_structure = not (element.num_rows == 0 and element.num_cols == 0)
            # Without a structure prediction, children (if any) are exposed
            # through one rich cell spanning a minimal 1x1 table.
            needs_rich_cell = not has_structure and bool(element.cluster.children)

            if has_structure:
                tbl_data = TableData(
                    num_rows=element.num_rows,
                    num_cols=element.num_cols,
                    table_cells=element.table_cells,
                )
            elif needs_rich_cell:
                tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
            else:
                # Empty table with no structure
                tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])

            prov = ProvenanceItem(
                page_no=element.page_no,
                charspan=(0, 0),
                bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
            )

            tbl = out_doc.add_table(
                data=tbl_data, prov=prov, label=element.cluster.label
            )
            attach_captions_and_footnotes(rel.cid, tbl)

            if needs_rich_cell:
                rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)

                rich_cell = RichTableCell(
                    text="",  # Empty text since content is in the group
                    row_span=1,
                    col_span=1,
                    start_row_offset_idx=0,
                    end_row_offset_idx=1,
                    start_col_offset_idx=0,
                    end_col_offset_idx=1,
                    column_header=False,
                    row_header=False,
                    ref=rich_cell_ref,
                )
                out_doc.add_table_cell(table_item=tbl, cell=rich_cell)

            # TODO: Consider adding children of Table.

        elif isinstance(element, FigureElement):
            prov = ProvenanceItem(
                page_no=element.page_no,
                charspan=(0, 0),
                bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
            )
            pic = out_doc.add_picture(prov=prov)
            attach_captions_and_footnotes(rel.cid, pic)

            self._add_child_elements(element, pic, out_doc)

        elif isinstance(element, ContainerElement):  # Form, KV region
            label = element.label
            group_label = GroupLabel.UNSPECIFIED
            if label == DocItemLabel.FORM:
                group_label = GroupLabel.FORM_AREA
            elif label == DocItemLabel.KEY_VALUE_REGION:
                group_label = GroupLabel.KEY_VALUE_AREA

            container_el = out_doc.add_group(label=group_label)

            self._add_child_elements(element, container_el, out_doc)

    return out_doc
328
+
329
def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
    """Add ``elem`` to ``out_doc`` as a text item beneath ``parent``.

    Args:
        elem: Element to add; must be a ``TextElement``.
        out_doc: Document receiving the new item.
        parent: Node the new text item is attached to.
        page_height: Height used to convert the element's bounding box to
            bottom-left origin.

    Returns:
        The newly created text item.
    """
    assert isinstance(elem, TextElement)
    content = elem.text
    provenance = ProvenanceItem(
        page_no=elem.page_no,
        charspan=(0, len(content)),
        bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
    )
    return out_doc.add_text(
        label=elem.label, text=content, prov=provenance, parent=parent
    )
341
+
342
def _handle_text_element(self, element, out_doc, current_list, page_height):
    """Add one text element to ``out_doc`` according to its label.

    List items go into ``current_list``, opening a new list group when none
    is open. Any other label closes the open list. Section headers become
    headings, formulas keep their text only in ``orig``, and page headers
    and footers are placed on the furniture content layer.

    Args:
        element: Text element to add.
        out_doc: Document receiving the new item.
        current_list: Currently open list group, or ``None``.
        page_height: Height used to convert the bounding box to
            bottom-left origin.

    Returns:
        Tuple ``(new_item, current_list)`` holding the created item and the
        list group to carry to the next element (``None`` when closed).
    """
    text = element.text
    prov = ProvenanceItem(
        page_no=element.page_no,
        charspan=(0, len(text)),
        bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
    )
    label = element.label

    if label == DocItemLabel.LIST_ITEM:
        if current_list is None:
            current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")

        # TODO: Infer if this is a numbered or a bullet list item
        list_item = out_doc.add_list_item(
            text=text, enumerated=False, prov=prov, parent=current_list
        )
        self.list_item_processor.process_list_item(list_item)
        return list_item, current_list

    # Every remaining label ends the running list, so None is returned for it.
    if label == DocItemLabel.SECTION_HEADER:
        new_item = out_doc.add_heading(text=text, prov=prov)
    elif label == DocItemLabel.FORMULA:
        new_item = out_doc.add_text(
            label=DocItemLabel.FORMULA, text="", orig=text, prov=prov
        )
    else:
        if label in (DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER):
            content_layer = ContentLayer.FURNITURE
        else:
            content_layer = ContentLayer.BODY

        new_item = out_doc.add_text(
            label=label,
            text=text,
            prov=prov,
            content_layer=content_layer,
        )

    return new_item, None
385
+
386
def _merge_elements(self, element, merged_elem, new_item, page_height):
    """Append the text of ``merged_elem`` to ``new_item`` in place.

    The text is joined with a single space and a provenance entry is added
    whose charspan covers the appended part within the combined text.

    Args:
        element: Element that ``new_item`` was created from.
        merged_elem: Element being merged; must have the same type as
            ``element`` and the same label as ``new_item``.
        new_item: Document item that is extended.
        page_height: Height used to convert the merged element's bounding
            box to bottom-left origin.
    """
    assert isinstance(merged_elem, type(element)), (
        "Merged element must be of same type as element."
    )
    assert merged_elem.label == new_item.label, (
        "Labels of merged elements must match."
    )

    # +1 skips the separating space inserted below.
    span_start = len(new_item.text) + 1
    span_end = span_start + len(merged_elem.text)
    merged_prov = ProvenanceItem(
        page_no=merged_elem.page_no,
        charspan=(span_start, span_end),
        bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
    )

    suffix = f" {merged_elem.text}"
    new_item.text += suffix
    # TODO: This is incomplete, we don't have the `orig` field of the merged element.
    new_item.orig += suffix
    new_item.prov.append(merged_prov)
404
+
405
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
    """Run the reading-order stage on ``conv_res`` and return the document.

    The assembled elements are sorted by the reading-order predictor, which
    is then queried for the caption, footnote and merge relations. All of it
    is passed on to build the output document. The work is recorded under
    the ``"reading_order"`` timer at document scope.

    Args:
        conv_res: Conversion result holding pages and assembled elements.

    Returns:
        The document built from the ordered elements.
    """
    with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
        candidates = self._assembled_to_readingorder_elements(conv_res)

        # Apply reading order
        ordered = self.ro_model.predict_reading_order(page_elements=candidates)

        # Relations between the ordered elements, keyed by cid.
        captions = self.ro_model.predict_to_captions(sorted_elements=ordered)
        footnotes = self.ro_model.predict_to_footnotes(sorted_elements=ordered)
        merges = self.ro_model.predict_merges(sorted_elements=ordered)

        docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
            conv_res=conv_res,
            ro_elements=ordered,
            el_to_captions_mapping=captions,
            el_to_footnotes_mapping=footnotes,
            el_merges_mapping=merges,
        )

    return docling_doc
File without changes