docling 2.23.1__py3-none-any.whl → 2.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,389 @@
1
+ import copy
2
+ import random
3
+ from pathlib import Path
4
+ from typing import Dict, List
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItem,
10
+ DocItemLabel,
11
+ DoclingDocument,
12
+ DocumentOrigin,
13
+ GroupLabel,
14
+ NodeItem,
15
+ ProvenanceItem,
16
+ RefItem,
17
+ TableData,
18
+ )
19
+ from docling_core.types.doc.document import ContentLayer
20
+ from docling_core.types.legacy_doc.base import Ref
21
+ from docling_core.types.legacy_doc.document import BaseText
22
+ from docling_ibm_models.reading_order.reading_order_rb import (
23
+ PageElement as ReadingOrderPageElement,
24
+ )
25
+ from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
26
+ from PIL import ImageDraw
27
+ from pydantic import BaseModel, ConfigDict
28
+
29
+ from docling.datamodel.base_models import (
30
+ BasePageElement,
31
+ Cluster,
32
+ ContainerElement,
33
+ FigureElement,
34
+ Table,
35
+ TextElement,
36
+ )
37
+ from docling.datamodel.document import ConversionResult
38
+ from docling.datamodel.settings import settings
39
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
40
+
41
+
42
class ReadingOrderOptions(BaseModel):
    """Options accepted by ``ReadingOrderModel``."""

    # The ``model_names`` field below begins with ``model_``, a prefix that
    # pydantic treats as a protected namespace by default; an empty tuple
    # disables that protection for this model.
    model_config = ConfigDict(protected_namespaces=())

    # Semicolon-separated names, e.g. "language;term;reference".
    model_names: str = ""
46
+
47
+
48
class ReadingOrderModel:
    """Order assembled page elements and build a ``DoclingDocument`` from them.

    The model wraps a ``ReadingOrderPredictor``: assembled elements of a
    ``ConversionResult`` are converted to predictor elements, sorted into
    reading order, linked to their captions / footnotes / merge partners,
    and finally written into a new ``DoclingDocument``.
    """

    def __init__(self, options: ReadingOrderOptions):
        """Store *options* and instantiate the reading-order predictor."""
        self.options = options
        self.ro_model = ReadingOrderPredictor()

    def _assembled_to_readingorder_elements(
        self, conv_res: ConversionResult
    ) -> List[ReadingOrderPageElement]:
        """Convert every assembled element into a predictor input element.

        Each element gets a running ``cid`` (its index in the returned list)
        and a ``ref`` of the form ``#/<page_no>/<cluster id>`` that is later
        used to find the assembled element again. Bounding boxes are converted
        to bottom-left origin using the height of the element's own page.
        """
        elements: List[ReadingOrderPageElement] = []
        page_no_to_pages = {p.page_no: p for p in conv_res.pages}

        for element in conv_res.assembled.elements:
            page = page_no_to_pages[element.page_no]
            page_height = page.size.height  # type: ignore
            bbox = element.cluster.bbox.to_bottom_left_origin(page_height)

            elements.append(
                ReadingOrderPageElement(
                    cid=len(elements),
                    ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
                    # Elements without text are passed on with an empty string.
                    text=element.text or "",
                    page_no=element.page_no,
                    page_size=page.size,
                    label=element.label,
                    l=bbox.l,
                    r=bbox.r,
                    b=bbox.b,
                    t=bbox.t,
                    coord_origin=bbox.coord_origin,
                )
            )

        return elements

    def _add_child_elements(
        self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
    ):
        """Add the child clusters of *element* to *doc* below *doc_item*.

        The text of a child is the space-joined text of its non-blank cells.
        List items and section headers use the dedicated ``add_*`` methods;
        every other label is added as plain text carrying that label.
        """
        # Pages in `doc` are keyed by `page_no + 1` (see `add_page` usage in
        # `_readingorder_elements_to_docling_doc`).
        doc_page_no = element.page_no + 1

        child: Cluster
        for child in element.cluster.children:
            c_label = child.label
            c_bbox = child.bbox.to_bottom_left_origin(
                doc.pages[doc_page_no].size.height
            )
            # NOTE(review): "\x02" is replaced by "-"; presumably it marks a
            # hyphen in the extracted cell text -- confirm.
            c_text = " ".join(
                [
                    cell.text.replace("\x02", "-").strip()
                    for cell in child.cells
                    if len(cell.text.strip()) > 0
                ]
            )

            c_prov = ProvenanceItem(
                page_no=doc_page_no, charspan=(0, len(c_text)), bbox=c_bbox
            )
            if c_label == DocItemLabel.LIST_ITEM:
                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
            elif c_label == DocItemLabel.SECTION_HEADER:
                doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
            else:
                doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)

    def _readingorder_elements_to_docling_doc(
        self,
        conv_res: ConversionResult,
        ro_elements: List[ReadingOrderPageElement],
        el_to_captions_mapping: Dict[int, List[int]],
        el_to_footnotes_mapping: Dict[int, List[int]],
        el_merges_mapping: Dict[int, List[int]],
    ) -> DoclingDocument:
        """Build the output document from elements already in reading order.

        Args:
            conv_res: Conversion result providing pages, assembled elements
                and the input file metadata.
            ro_elements: Predictor elements in reading order.
            el_to_captions_mapping: ``cid`` of an element -> ``cid``s of its
                captions.
            el_to_footnotes_mapping: ``cid`` of an element -> ``cid``s of its
                footnotes.
            el_merges_mapping: ``cid`` of a text element -> ``cid``s of the
                elements whose text is appended to it.

        Returns:
            A new ``DoclingDocument`` named after the input file's stem.
        """
        id_to_elem = {
            RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
            for elem in conv_res.assembled.elements
        }
        cid_to_rels = {rel.cid: rel for rel in ro_elements}
        page_no_to_pages = {p.page_no: p for p in conv_res.pages}

        origin = DocumentOrigin(
            mimetype="application/pdf",
            filename=conv_res.input.file.name,
            binary_hash=conv_res.input.document_hash,
        )
        doc_name = Path(origin.filename).stem
        out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

        for page in conv_res.pages:
            size = page.size
            assert size is not None
            out_doc.add_page(page_no=page.page_no + 1, size=size)

        # Elements that appear as a caption, footnote or merge partner are
        # emitted together with their owner, never as stand-alone items.
        skippable_cids = {
            cid
            for mapping in (
                el_to_captions_mapping,
                el_to_footnotes_mapping,
                el_merges_mapping,
            )
            for lst in mapping.values()
            for cid in lst
        }

        def elem_for_cid(cid: int):
            """Return the assembled element behind a predictor ``cid``."""
            return id_to_elem[cid_to_rels[cid].ref.cref]

        def height_of(elem) -> float:
            """Return the height of the page *elem* itself lives on."""
            return page_no_to_pages[elem.page_no].size.height  # type: ignore

        def attach_captions_and_footnotes(item, cid: int) -> None:
            """Add captions, then footnotes, of element *cid* below *item*."""
            for caption_cid in el_to_captions_mapping.get(cid, ()):
                caption_elem = elem_for_cid(caption_cid)
                # The bbox is flipped with the caption's own page height: it
                # may sit on a different page than the item it belongs to.
                new_cap_item = self._add_caption_or_footnote(
                    caption_elem, out_doc, item, height_of(caption_elem)
                )
                item.captions.append(new_cap_item.get_ref())

            for footnote_cid in el_to_footnotes_mapping.get(cid, ()):
                footnote_elem = elem_for_cid(footnote_cid)
                new_footnote_item = self._add_caption_or_footnote(
                    footnote_elem, out_doc, item, height_of(footnote_elem)
                )
                item.footnotes.append(new_footnote_item.get_ref())

        current_list = None

        for rel in ro_elements:
            if rel.cid in skippable_cids:
                continue
            element = id_to_elem[rel.ref.cref]
            page_height = height_of(element)

            if isinstance(element, TextElement):
                if element.label == DocItemLabel.CODE:
                    cap_text = element.text
                    prov = ProvenanceItem(
                        page_no=element.page_no + 1,
                        charspan=(0, len(cap_text)),
                        bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                    )
                    code_item = out_doc.add_code(text=cap_text, prov=prov)
                    attach_captions_and_footnotes(code_item, rel.cid)
                else:
                    new_item, current_list = self._handle_text_element(
                        element, out_doc, current_list, page_height
                    )

                    for merged_cid in el_merges_mapping.get(rel.cid, ()):
                        merged_elem = elem_for_cid(merged_cid)
                        # Pass the merged fragment's own page height; the
                        # fragment may continue on another page.
                        self._merge_elements(
                            element, merged_elem, new_item, height_of(merged_elem)
                        )

            elif isinstance(element, Table):
                tbl_data = TableData(
                    num_rows=element.num_rows,
                    num_cols=element.num_cols,
                    table_cells=element.table_cells,
                )
                prov = ProvenanceItem(
                    page_no=element.page_no + 1,
                    charspan=(0, 0),
                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                )
                tbl = out_doc.add_table(
                    data=tbl_data, prov=prov, label=element.cluster.label
                )
                attach_captions_and_footnotes(tbl, rel.cid)

                # TODO: Consider adding children of Table.

            elif isinstance(element, FigureElement):
                prov = ProvenanceItem(
                    page_no=element.page_no + 1,
                    charspan=(0, 0),  # pictures carry no text of their own
                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                )
                pic = out_doc.add_picture(prov=prov)
                attach_captions_and_footnotes(pic, rel.cid)

                self._add_child_elements(element, pic, out_doc)

            elif isinstance(element, ContainerElement):  # Form, KV region
                label = element.label
                group_label = GroupLabel.UNSPECIFIED
                if label == DocItemLabel.FORM:
                    group_label = GroupLabel.FORM_AREA
                elif label == DocItemLabel.KEY_VALUE_REGION:
                    group_label = GroupLabel.KEY_VALUE_AREA

                container_el = out_doc.add_group(label=group_label)

                self._add_child_elements(element, container_el, out_doc)

        return out_doc

    def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
        """Add *elem* as a text item below *parent* and return the new item.

        Args:
            elem: The caption or footnote; must be a ``TextElement``.
            out_doc: Document the item is added to.
            parent: Item (code, table, picture) the caption/footnote belongs to.
            page_height: Height used to convert ``elem``'s bbox to bottom-left
                origin; should be the height of the page ``elem`` is on.
        """
        assert isinstance(elem, TextElement)
        text = elem.text
        prov = ProvenanceItem(
            page_no=elem.page_no + 1,
            charspan=(0, len(text)),
            bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
        )
        new_item = out_doc.add_text(
            label=elem.label, text=text, prov=prov, parent=parent
        )
        return new_item

    def _handle_text_element(self, element, out_doc, current_list, page_height):
        """Add a non-code text element to *out_doc*.

        Consecutive list items share one list group: the group is created on
        the first list item and passed back as ``current_list``; any other
        label handled here resets it to ``None``.

        Returns:
            ``(new_item, current_list)`` -- the item that was added and the
            list group to hand to the next call.
        """
        cap_text = element.text

        prov = ProvenanceItem(
            page_no=element.page_no + 1,
            charspan=(0, len(cap_text)),
            bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
        )
        label = element.label
        if label == DocItemLabel.LIST_ITEM:
            if current_list is None:
                current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")

            # TODO: Infer if this is a numbered or a bullet list item
            new_item = out_doc.add_list_item(
                text=cap_text, enumerated=False, prov=prov, parent=current_list
            )
        elif label == DocItemLabel.SECTION_HEADER:
            current_list = None

            new_item = out_doc.add_heading(text=cap_text, prov=prov)
        elif label == DocItemLabel.FORMULA:
            current_list = None

            # The extracted text is kept only in `orig`; `text` stays empty.
            new_item = out_doc.add_text(
                label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
            )
        else:
            current_list = None

            # Page headers and footers go to the furniture layer, everything
            # else to the body.
            content_layer = ContentLayer.BODY
            if element.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
                content_layer = ContentLayer.FURNITURE

            new_item = out_doc.add_text(
                label=element.label,
                text=cap_text,
                prov=prov,
                content_layer=content_layer,
            )
        return new_item, current_list

    def _merge_elements(self, element, merged_elem, new_item, page_height):
        """Append the text of *merged_elem* to *new_item*.

        A space plus the merged text is appended to both ``text`` and ``orig``
        of *new_item*, and a provenance entry describing the merged fragment
        is added.

        Args:
            element: The element *new_item* was created from.
            merged_elem: The element being merged in; must be of the same type
                as *element* and carry the same label as *new_item*.
            new_item: The document item that receives the merged text.
            page_height: Height used to convert ``merged_elem``'s bbox to
                bottom-left origin; should be the height of the page
                ``merged_elem`` is on.
        """
        assert isinstance(
            merged_elem, type(element)
        ), "Merged element must be of same type as element."
        assert (
            merged_elem.label == new_item.label
        ), "Labels of merged elements must match."
        # The provenance describes the *merged* fragment, so page and bbox
        # come from `merged_elem`, not from the element it is merged into.
        # The charspan starts after the separating space added below.
        span_start = len(new_item.text) + 1
        prov = ProvenanceItem(
            page_no=merged_elem.page_no + 1,
            charspan=(span_start, span_start + len(merged_elem.text)),
            bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
        )
        new_item.text += f" {merged_elem.text}"
        new_item.orig += f" {merged_elem.text}"  # TODO: This is incomplete, we don't have the `orig` field of the merged element.
        new_item.prov.append(prov)

    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
        """Run reading-order prediction on *conv_res* and return the document."""
        # NOTE(review): the profiling key is "glm" although this is the
        # reading-order model; presumably kept so existing timing consumers
        # keep working -- confirm before renaming.
        with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
            page_elements = self._assembled_to_readingorder_elements(conv_res)

            # Apply reading order
            sorted_elements = self.ro_model.predict_reading_order(
                page_elements=page_elements
            )
            el_to_captions_mapping = self.ro_model.predict_to_captions(
                sorted_elements=sorted_elements
            )
            el_to_footnotes_mapping = self.ro_model.predict_to_footnotes(
                sorted_elements=sorted_elements
            )
            el_merges_mapping = self.ro_model.predict_merges(
                sorted_elements=sorted_elements
            )

            docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
                conv_res,
                sorted_elements,
                el_to_captions_mapping,
                el_to_footnotes_mapping,
                el_merges_mapping,
            )

        return docling_doc
@@ -27,7 +27,6 @@ from docling.models.document_picture_classifier import (
27
27
  DocumentPictureClassifier,
28
28
  DocumentPictureClassifierOptions,
29
29
  )
30
- from docling.models.ds_glm_model import GlmModel, GlmOptions
31
30
  from docling.models.easyocr_model import EasyOcrModel
32
31
  from docling.models.layout_model import LayoutModel
33
32
  from docling.models.ocr_mac_model import OcrMacModel
@@ -40,6 +39,7 @@ from docling.models.picture_description_api_model import PictureDescriptionApiMo
40
39
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
41
40
  from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
42
41
  from docling.models.rapid_ocr_model import RapidOcrModel
42
+ from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
43
43
  from docling.models.table_structure_model import TableStructureModel
44
44
  from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
45
45
  from docling.models.tesseract_ocr_model import TesseractOcrModel
@@ -76,7 +76,7 @@ class StandardPdfPipeline(PaginatedPipeline):
76
76
  or self.pipeline_options.generate_table_images
77
77
  )
78
78
 
79
- self.glm_model = GlmModel(options=GlmOptions())
79
+ self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
80
80
 
81
81
  if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
82
82
  raise RuntimeError(