docling 2.23.1__tar.gz → 2.24.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {docling-2.23.1 → docling-2.24.0}/PKG-INFO +2 -3
  2. {docling-2.23.1 → docling-2.24.0}/docling/models/page_assemble_model.py +8 -0
  3. docling-2.24.0/docling/models/readingorder_model.py +389 -0
  4. {docling-2.23.1 → docling-2.24.0}/docling/pipeline/standard_pdf_pipeline.py +2 -2
  5. {docling-2.23.1 → docling-2.24.0}/pyproject.toml +2 -4
  6. docling-2.23.1/docling/models/ds_glm_model.py +0 -386
  7. {docling-2.23.1 → docling-2.24.0}/LICENSE +0 -0
  8. {docling-2.23.1 → docling-2.24.0}/README.md +0 -0
  9. {docling-2.23.1 → docling-2.24.0}/docling/__init__.py +0 -0
  10. {docling-2.23.1 → docling-2.24.0}/docling/backend/__init__.py +0 -0
  11. {docling-2.23.1 → docling-2.24.0}/docling/backend/abstract_backend.py +0 -0
  12. {docling-2.23.1 → docling-2.24.0}/docling/backend/asciidoc_backend.py +0 -0
  13. {docling-2.23.1 → docling-2.24.0}/docling/backend/csv_backend.py +0 -0
  14. {docling-2.23.1 → docling-2.24.0}/docling/backend/docling_parse_backend.py +0 -0
  15. {docling-2.23.1 → docling-2.24.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  16. {docling-2.23.1 → docling-2.24.0}/docling/backend/html_backend.py +0 -0
  17. {docling-2.23.1 → docling-2.24.0}/docling/backend/json/__init__.py +0 -0
  18. {docling-2.23.1 → docling-2.24.0}/docling/backend/json/docling_json_backend.py +0 -0
  19. {docling-2.23.1 → docling-2.24.0}/docling/backend/md_backend.py +0 -0
  20. {docling-2.23.1 → docling-2.24.0}/docling/backend/msexcel_backend.py +0 -0
  21. {docling-2.23.1 → docling-2.24.0}/docling/backend/mspowerpoint_backend.py +0 -0
  22. {docling-2.23.1 → docling-2.24.0}/docling/backend/msword_backend.py +0 -0
  23. {docling-2.23.1 → docling-2.24.0}/docling/backend/pdf_backend.py +0 -0
  24. {docling-2.23.1 → docling-2.24.0}/docling/backend/pypdfium2_backend.py +0 -0
  25. {docling-2.23.1 → docling-2.24.0}/docling/backend/xml/__init__.py +0 -0
  26. {docling-2.23.1 → docling-2.24.0}/docling/backend/xml/jats_backend.py +0 -0
  27. {docling-2.23.1 → docling-2.24.0}/docling/backend/xml/uspto_backend.py +0 -0
  28. {docling-2.23.1 → docling-2.24.0}/docling/chunking/__init__.py +0 -0
  29. {docling-2.23.1 → docling-2.24.0}/docling/cli/__init__.py +0 -0
  30. {docling-2.23.1 → docling-2.24.0}/docling/cli/main.py +0 -0
  31. {docling-2.23.1 → docling-2.24.0}/docling/cli/models.py +0 -0
  32. {docling-2.23.1 → docling-2.24.0}/docling/cli/tools.py +0 -0
  33. {docling-2.23.1 → docling-2.24.0}/docling/datamodel/__init__.py +0 -0
  34. {docling-2.23.1 → docling-2.24.0}/docling/datamodel/base_models.py +0 -0
  35. {docling-2.23.1 → docling-2.24.0}/docling/datamodel/document.py +0 -0
  36. {docling-2.23.1 → docling-2.24.0}/docling/datamodel/pipeline_options.py +0 -0
  37. {docling-2.23.1 → docling-2.24.0}/docling/datamodel/settings.py +0 -0
  38. {docling-2.23.1 → docling-2.24.0}/docling/document_converter.py +0 -0
  39. {docling-2.23.1 → docling-2.24.0}/docling/exceptions.py +0 -0
  40. {docling-2.23.1 → docling-2.24.0}/docling/models/__init__.py +0 -0
  41. {docling-2.23.1 → docling-2.24.0}/docling/models/base_model.py +0 -0
  42. {docling-2.23.1 → docling-2.24.0}/docling/models/base_ocr_model.py +0 -0
  43. {docling-2.23.1 → docling-2.24.0}/docling/models/code_formula_model.py +0 -0
  44. {docling-2.23.1 → docling-2.24.0}/docling/models/document_picture_classifier.py +0 -0
  45. {docling-2.23.1 → docling-2.24.0}/docling/models/easyocr_model.py +0 -0
  46. {docling-2.23.1 → docling-2.24.0}/docling/models/layout_model.py +0 -0
  47. {docling-2.23.1 → docling-2.24.0}/docling/models/ocr_mac_model.py +0 -0
  48. {docling-2.23.1 → docling-2.24.0}/docling/models/page_preprocessing_model.py +0 -0
  49. {docling-2.23.1 → docling-2.24.0}/docling/models/picture_description_api_model.py +0 -0
  50. {docling-2.23.1 → docling-2.24.0}/docling/models/picture_description_base_model.py +0 -0
  51. {docling-2.23.1 → docling-2.24.0}/docling/models/picture_description_vlm_model.py +0 -0
  52. {docling-2.23.1 → docling-2.24.0}/docling/models/rapid_ocr_model.py +0 -0
  53. {docling-2.23.1 → docling-2.24.0}/docling/models/table_structure_model.py +0 -0
  54. {docling-2.23.1 → docling-2.24.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  55. {docling-2.23.1 → docling-2.24.0}/docling/models/tesseract_ocr_model.py +0 -0
  56. {docling-2.23.1 → docling-2.24.0}/docling/pipeline/__init__.py +0 -0
  57. {docling-2.23.1 → docling-2.24.0}/docling/pipeline/base_pipeline.py +0 -0
  58. {docling-2.23.1 → docling-2.24.0}/docling/pipeline/simple_pipeline.py +0 -0
  59. {docling-2.23.1 → docling-2.24.0}/docling/py.typed +0 -0
  60. {docling-2.23.1 → docling-2.24.0}/docling/utils/__init__.py +0 -0
  61. {docling-2.23.1 → docling-2.24.0}/docling/utils/accelerator_utils.py +0 -0
  62. {docling-2.23.1 → docling-2.24.0}/docling/utils/export.py +0 -0
  63. {docling-2.23.1 → docling-2.24.0}/docling/utils/glm_utils.py +0 -0
  64. {docling-2.23.1 → docling-2.24.0}/docling/utils/layout_postprocessor.py +0 -0
  65. {docling-2.23.1 → docling-2.24.0}/docling/utils/model_downloader.py +0 -0
  66. {docling-2.23.1 → docling-2.24.0}/docling/utils/ocr_utils.py +0 -0
  67. {docling-2.23.1 → docling-2.24.0}/docling/utils/profiling.py +0 -0
  68. {docling-2.23.1 → docling-2.24.0}/docling/utils/utils.py +0 -0
  69. {docling-2.23.1 → docling-2.24.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.23.1
3
+ Version: 2.24.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -27,9 +27,8 @@ Provides-Extra: tesserocr
27
27
  Provides-Extra: vlm
28
28
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
29
29
  Requires-Dist: certifi (>=2024.7.4)
30
- Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
31
30
  Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
32
- Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
+ Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
32
  Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
34
33
  Requires-Dist: easyocr (>=1.7,<2.0)
35
34
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
@@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
52
52
 
53
53
  sanitized_text = "".join(lines)
54
54
 
55
+ # Text normalization
56
+ sanitized_text = sanitized_text.replace("⁄", "/")
57
+ sanitized_text = sanitized_text.replace("’", "'")
58
+ sanitized_text = sanitized_text.replace("‘", "'")
59
+ sanitized_text = sanitized_text.replace("“", '"')
60
+ sanitized_text = sanitized_text.replace("”", '"')
61
+ sanitized_text = sanitized_text.replace("•", "·")
62
+
55
63
  return sanitized_text.strip() # Strip any leading or trailing whitespace
56
64
 
57
65
  def __call__(
@@ -0,0 +1,389 @@
1
+ import copy
2
+ import random
3
+ from pathlib import Path
4
+ from typing import Dict, List
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItem,
10
+ DocItemLabel,
11
+ DoclingDocument,
12
+ DocumentOrigin,
13
+ GroupLabel,
14
+ NodeItem,
15
+ ProvenanceItem,
16
+ RefItem,
17
+ TableData,
18
+ )
19
+ from docling_core.types.doc.document import ContentLayer
20
+ from docling_core.types.legacy_doc.base import Ref
21
+ from docling_core.types.legacy_doc.document import BaseText
22
+ from docling_ibm_models.reading_order.reading_order_rb import (
23
+ PageElement as ReadingOrderPageElement,
24
+ )
25
+ from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
26
+ from PIL import ImageDraw
27
+ from pydantic import BaseModel, ConfigDict
28
+
29
+ from docling.datamodel.base_models import (
30
+ BasePageElement,
31
+ Cluster,
32
+ ContainerElement,
33
+ FigureElement,
34
+ Table,
35
+ TextElement,
36
+ )
37
+ from docling.datamodel.document import ConversionResult
38
+ from docling.datamodel.settings import settings
39
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
40
+
41
+
42
+ class ReadingOrderOptions(BaseModel):
43
+ model_config = ConfigDict(protected_namespaces=())
44
+
45
+ model_names: str = "" # e.g. "language;term;reference"
46
+
47
+
48
+ class ReadingOrderModel:
49
+ def __init__(self, options: ReadingOrderOptions):
50
+ self.options = options
51
+ self.ro_model = ReadingOrderPredictor()
52
+
53
+ def _assembled_to_readingorder_elements(
54
+ self, conv_res: ConversionResult
55
+ ) -> List[ReadingOrderPageElement]:
56
+
57
+ elements: List[ReadingOrderPageElement] = []
58
+ page_no_to_pages = {p.page_no: p for p in conv_res.pages}
59
+
60
+ for element in conv_res.assembled.elements:
61
+
62
+ page_height = page_no_to_pages[element.page_no].size.height # type: ignore
63
+ bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
64
+ text = element.text or ""
65
+
66
+ elements.append(
67
+ ReadingOrderPageElement(
68
+ cid=len(elements),
69
+ ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
70
+ text=text,
71
+ page_no=element.page_no,
72
+ page_size=page_no_to_pages[element.page_no].size,
73
+ label=element.label,
74
+ l=bbox.l,
75
+ r=bbox.r,
76
+ b=bbox.b,
77
+ t=bbox.t,
78
+ coord_origin=bbox.coord_origin,
79
+ )
80
+ )
81
+
82
+ return elements
83
+
84
+ def _add_child_elements(
85
+ self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
86
+ ):
87
+
88
+ child: Cluster
89
+ for child in element.cluster.children:
90
+ c_label = child.label
91
+ c_bbox = child.bbox.to_bottom_left_origin(
92
+ doc.pages[element.page_no + 1].size.height
93
+ )
94
+ c_text = " ".join(
95
+ [
96
+ cell.text.replace("\x02", "-").strip()
97
+ for cell in child.cells
98
+ if len(cell.text.strip()) > 0
99
+ ]
100
+ )
101
+
102
+ c_prov = ProvenanceItem(
103
+ page_no=element.page_no + 1, charspan=(0, len(c_text)), bbox=c_bbox
104
+ )
105
+ if c_label == DocItemLabel.LIST_ITEM:
106
+ # TODO: Infer if this is a numbered or a bullet list item
107
+ doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
108
+ elif c_label == DocItemLabel.SECTION_HEADER:
109
+ doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
110
+ else:
111
+ doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
112
+
113
+ def _readingorder_elements_to_docling_doc(
114
+ self,
115
+ conv_res: ConversionResult,
116
+ ro_elements: List[ReadingOrderPageElement],
117
+ el_to_captions_mapping: Dict[int, List[int]],
118
+ el_to_footnotes_mapping: Dict[int, List[int]],
119
+ el_merges_mapping: Dict[int, List[int]],
120
+ ) -> DoclingDocument:
121
+
122
+ id_to_elem = {
123
+ RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
124
+ for elem in conv_res.assembled.elements
125
+ }
126
+ cid_to_rels = {rel.cid: rel for rel in ro_elements}
127
+
128
+ origin = DocumentOrigin(
129
+ mimetype="application/pdf",
130
+ filename=conv_res.input.file.name,
131
+ binary_hash=conv_res.input.document_hash,
132
+ )
133
+ doc_name = Path(origin.filename).stem
134
+ out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
135
+
136
+ for page in conv_res.pages:
137
+ page_no = page.page_no + 1
138
+ size = page.size
139
+
140
+ assert size is not None
141
+
142
+ out_doc.add_page(page_no=page_no, size=size)
143
+
144
+ current_list = None
145
+ skippable_cids = {
146
+ cid
147
+ for mapping in (
148
+ el_to_captions_mapping,
149
+ el_to_footnotes_mapping,
150
+ el_merges_mapping,
151
+ )
152
+ for lst in mapping.values()
153
+ for cid in lst
154
+ }
155
+
156
+ page_no_to_pages = {p.page_no: p for p in conv_res.pages}
157
+
158
+ for rel in ro_elements:
159
+ if rel.cid in skippable_cids:
160
+ continue
161
+ element = id_to_elem[rel.ref.cref]
162
+
163
+ page_height = page_no_to_pages[element.page_no].size.height # type: ignore
164
+
165
+ if isinstance(element, TextElement):
166
+ if element.label == DocItemLabel.CODE:
167
+ cap_text = element.text
168
+ prov = ProvenanceItem(
169
+ page_no=element.page_no + 1,
170
+ charspan=(0, len(cap_text)),
171
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
172
+ )
173
+ code_item = out_doc.add_code(text=cap_text, prov=prov)
174
+
175
+ if rel.cid in el_to_captions_mapping.keys():
176
+ for caption_cid in el_to_captions_mapping[rel.cid]:
177
+ caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
178
+ new_cap_item = self._add_caption_or_footnote(
179
+ caption_elem, out_doc, code_item, page_height
180
+ )
181
+
182
+ code_item.captions.append(new_cap_item.get_ref())
183
+
184
+ if rel.cid in el_to_footnotes_mapping.keys():
185
+ for footnote_cid in el_to_footnotes_mapping[rel.cid]:
186
+ footnote_elem = id_to_elem[
187
+ cid_to_rels[footnote_cid].ref.cref
188
+ ]
189
+ new_footnote_item = self._add_caption_or_footnote(
190
+ footnote_elem, out_doc, code_item, page_height
191
+ )
192
+
193
+ code_item.footnotes.append(new_footnote_item.get_ref())
194
+ else:
195
+
196
+ new_item, current_list = self._handle_text_element(
197
+ element, out_doc, current_list, page_height
198
+ )
199
+
200
+ if rel.cid in el_merges_mapping.keys():
201
+ for merged_cid in el_merges_mapping[rel.cid]:
202
+ merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
203
+
204
+ self._merge_elements(
205
+ element, merged_elem, new_item, page_height
206
+ )
207
+
208
+ elif isinstance(element, Table):
209
+
210
+ tbl_data = TableData(
211
+ num_rows=element.num_rows,
212
+ num_cols=element.num_cols,
213
+ table_cells=element.table_cells,
214
+ )
215
+
216
+ prov = ProvenanceItem(
217
+ page_no=element.page_no + 1,
218
+ charspan=(0, 0),
219
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
220
+ )
221
+
222
+ tbl = out_doc.add_table(
223
+ data=tbl_data, prov=prov, label=element.cluster.label
224
+ )
225
+
226
+ if rel.cid in el_to_captions_mapping.keys():
227
+ for caption_cid in el_to_captions_mapping[rel.cid]:
228
+ caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
229
+ new_cap_item = self._add_caption_or_footnote(
230
+ caption_elem, out_doc, tbl, page_height
231
+ )
232
+
233
+ tbl.captions.append(new_cap_item.get_ref())
234
+
235
+ if rel.cid in el_to_footnotes_mapping.keys():
236
+ for footnote_cid in el_to_footnotes_mapping[rel.cid]:
237
+ footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
238
+ new_footnote_item = self._add_caption_or_footnote(
239
+ footnote_elem, out_doc, tbl, page_height
240
+ )
241
+
242
+ tbl.footnotes.append(new_footnote_item.get_ref())
243
+
244
+ # TODO: Consider adding children of Table.
245
+
246
+ elif isinstance(element, FigureElement):
247
+ cap_text = ""
248
+ prov = ProvenanceItem(
249
+ page_no=element.page_no + 1,
250
+ charspan=(0, len(cap_text)),
251
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
252
+ )
253
+ pic = out_doc.add_picture(prov=prov)
254
+
255
+ if rel.cid in el_to_captions_mapping.keys():
256
+ for caption_cid in el_to_captions_mapping[rel.cid]:
257
+ caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
258
+ new_cap_item = self._add_caption_or_footnote(
259
+ caption_elem, out_doc, pic, page_height
260
+ )
261
+
262
+ pic.captions.append(new_cap_item.get_ref())
263
+
264
+ if rel.cid in el_to_footnotes_mapping.keys():
265
+ for footnote_cid in el_to_footnotes_mapping[rel.cid]:
266
+ footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
267
+ new_footnote_item = self._add_caption_or_footnote(
268
+ footnote_elem, out_doc, pic, page_height
269
+ )
270
+
271
+ pic.footnotes.append(new_footnote_item.get_ref())
272
+
273
+ self._add_child_elements(element, pic, out_doc)
274
+
275
+ elif isinstance(element, ContainerElement): # Form, KV region
276
+ label = element.label
277
+ group_label = GroupLabel.UNSPECIFIED
278
+ if label == DocItemLabel.FORM:
279
+ group_label = GroupLabel.FORM_AREA
280
+ elif label == DocItemLabel.KEY_VALUE_REGION:
281
+ group_label = GroupLabel.KEY_VALUE_AREA
282
+
283
+ container_el = out_doc.add_group(label=group_label)
284
+
285
+ self._add_child_elements(element, container_el, out_doc)
286
+
287
+ return out_doc
288
+
289
+ def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
290
+ assert isinstance(elem, TextElement)
291
+ text = elem.text
292
+ prov = ProvenanceItem(
293
+ page_no=elem.page_no + 1,
294
+ charspan=(0, len(text)),
295
+ bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
296
+ )
297
+ new_item = out_doc.add_text(
298
+ label=elem.label, text=text, prov=prov, parent=parent
299
+ )
300
+ return new_item
301
+
302
+ def _handle_text_element(self, element, out_doc, current_list, page_height):
303
+ cap_text = element.text
304
+
305
+ prov = ProvenanceItem(
306
+ page_no=element.page_no + 1,
307
+ charspan=(0, len(cap_text)),
308
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
309
+ )
310
+ label = element.label
311
+ if label == DocItemLabel.LIST_ITEM:
312
+ if current_list is None:
313
+ current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")
314
+
315
+ # TODO: Infer if this is a numbered or a bullet list item
316
+ new_item = out_doc.add_list_item(
317
+ text=cap_text, enumerated=False, prov=prov, parent=current_list
318
+ )
319
+ elif label == DocItemLabel.SECTION_HEADER:
320
+ current_list = None
321
+
322
+ new_item = out_doc.add_heading(text=cap_text, prov=prov)
323
+ elif label == DocItemLabel.FORMULA:
324
+ current_list = None
325
+
326
+ new_item = out_doc.add_text(
327
+ label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
328
+ )
329
+ else:
330
+ current_list = None
331
+
332
+ content_layer = ContentLayer.BODY
333
+ if element.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
334
+ content_layer = ContentLayer.FURNITURE
335
+
336
+ new_item = out_doc.add_text(
337
+ label=element.label,
338
+ text=cap_text,
339
+ prov=prov,
340
+ content_layer=content_layer,
341
+ )
342
+ return new_item, current_list
343
+
344
+ def _merge_elements(self, element, merged_elem, new_item, page_height):
345
+ assert isinstance(
346
+ merged_elem, type(element)
347
+ ), "Merged element must be of same type as element."
348
+ assert (
349
+ merged_elem.label == new_item.label
350
+ ), "Labels of merged elements must match."
351
+ prov = ProvenanceItem(
352
+ page_no=element.page_no + 1,
353
+ charspan=(
354
+ len(new_item.text) + 1,
355
+ len(new_item.text) + 1 + len(merged_elem.text),
356
+ ),
357
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
358
+ )
359
+ new_item.text += f" {merged_elem.text}"
360
+ new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
361
+ new_item.prov.append(prov)
362
+
363
+ def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
364
+ with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
365
+ page_elements = self._assembled_to_readingorder_elements(conv_res)
366
+
367
+ # Apply reading order
368
+ sorted_elements = self.ro_model.predict_reading_order(
369
+ page_elements=page_elements
370
+ )
371
+ el_to_captions_mapping = self.ro_model.predict_to_captions(
372
+ sorted_elements=sorted_elements
373
+ )
374
+ el_to_footnotes_mapping = self.ro_model.predict_to_footnotes(
375
+ sorted_elements=sorted_elements
376
+ )
377
+ el_merges_mapping = self.ro_model.predict_merges(
378
+ sorted_elements=sorted_elements
379
+ )
380
+
381
+ docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
382
+ conv_res,
383
+ sorted_elements,
384
+ el_to_captions_mapping,
385
+ el_to_footnotes_mapping,
386
+ el_merges_mapping,
387
+ )
388
+
389
+ return docling_doc
@@ -27,7 +27,6 @@ from docling.models.document_picture_classifier import (
27
27
  DocumentPictureClassifier,
28
28
  DocumentPictureClassifierOptions,
29
29
  )
30
- from docling.models.ds_glm_model import GlmModel, GlmOptions
31
30
  from docling.models.easyocr_model import EasyOcrModel
32
31
  from docling.models.layout_model import LayoutModel
33
32
  from docling.models.ocr_mac_model import OcrMacModel
@@ -40,6 +39,7 @@ from docling.models.picture_description_api_model import PictureDescriptionApiMo
40
39
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
41
40
  from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
42
41
  from docling.models.rapid_ocr_model import RapidOcrModel
42
+ from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
43
43
  from docling.models.table_structure_model import TableStructureModel
44
44
  from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
45
45
  from docling.models.tesseract_ocr_model import TesseractOcrModel
@@ -76,7 +76,7 @@ class StandardPdfPipeline(PaginatedPipeline):
76
76
  or self.pipeline_options.generate_table_images
77
77
  )
78
78
 
79
- self.glm_model = GlmModel(options=GlmOptions())
79
+ self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
80
80
 
81
81
  if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
82
82
  raise RuntimeError(
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.23.1" # DO NOT EDIT, updated automatically
3
+ version = "2.24.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -27,8 +27,7 @@ packages = [{include = "docling"}]
27
27
  python = "^3.9"
28
28
  pydantic = "^2.0.0"
29
29
  docling-core = {extras = ["chunking"], version = "^2.19.0"}
30
- docling-ibm-models = "^3.3.0"
31
- deepsearch-glm = "^1.0.0"
30
+ docling-ibm-models = "^3.4.0"
32
31
  docling-parse = "^3.3.0"
33
32
  filetype = "^1.2.0"
34
33
  pypdfium2 = "^4.30.0"
@@ -164,7 +163,6 @@ module = [
164
163
  "docling_ibm_models.*",
165
164
  "easyocr.*",
166
165
  "ocrmac.*",
167
- "deepsearch_glm.*",
168
166
  "lxml.*",
169
167
  "huggingface_hub.*",
170
168
  "transformers.*",
@@ -1,386 +0,0 @@
1
- import copy
2
- import random
3
- from pathlib import Path
4
- from typing import List, Union
5
-
6
- from deepsearch_glm.andromeda_nlp import nlp_model
7
- from docling_core.types.doc import (
8
- BoundingBox,
9
- CoordOrigin,
10
- DocItemLabel,
11
- DoclingDocument,
12
- )
13
- from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
14
- from docling_core.types.legacy_doc.base import (
15
- Figure,
16
- PageDimensions,
17
- PageReference,
18
- Prov,
19
- Ref,
20
- )
21
- from docling_core.types.legacy_doc.base import Table as DsSchemaTable
22
- from docling_core.types.legacy_doc.base import TableCell
23
- from docling_core.types.legacy_doc.document import BaseText
24
- from docling_core.types.legacy_doc.document import (
25
- CCSDocumentDescription as DsDocumentDescription,
26
- )
27
- from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
28
- from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
29
- from PIL import ImageDraw
30
- from pydantic import BaseModel, ConfigDict, TypeAdapter
31
-
32
- from docling.datamodel.base_models import (
33
- Cluster,
34
- ContainerElement,
35
- FigureElement,
36
- Table,
37
- TextElement,
38
- )
39
- from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
40
- from docling.datamodel.settings import settings
41
- from docling.utils.glm_utils import to_docling_document
42
- from docling.utils.profiling import ProfilingScope, TimeRecorder
43
- from docling.utils.utils import create_hash
44
-
45
-
46
- class GlmOptions(BaseModel):
47
- model_config = ConfigDict(protected_namespaces=())
48
-
49
- model_names: str = "" # e.g. "language;term;reference"
50
-
51
-
52
- class GlmModel:
53
- def __init__(self, options: GlmOptions):
54
- self.options = options
55
-
56
- self.model = nlp_model(loglevel="error", text_ordering=True)
57
-
58
- def _to_legacy_document(self, conv_res) -> DsDocument:
59
- title = ""
60
- desc: DsDocumentDescription = DsDocumentDescription(logs=[])
61
-
62
- page_hashes = [
63
- PageReference(
64
- hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
65
- page=p.page_no + 1,
66
- model="default",
67
- )
68
- for p in conv_res.pages
69
- ]
70
-
71
- file_info = DsFileInfoObject(
72
- filename=conv_res.input.file.name,
73
- document_hash=conv_res.input.document_hash,
74
- num_pages=conv_res.input.page_count,
75
- page_hashes=page_hashes,
76
- )
77
-
78
- main_text: List[Union[Ref, BaseText]] = []
79
- page_headers: List[Union[Ref, BaseText]] = []
80
- page_footers: List[Union[Ref, BaseText]] = []
81
-
82
- tables: List[DsSchemaTable] = []
83
- figures: List[Figure] = []
84
-
85
- page_no_to_page = {p.page_no: p for p in conv_res.pages}
86
-
87
- for element in conv_res.assembled.body:
88
- # Convert bboxes to lower-left origin.
89
- target_bbox = DsBoundingBox(
90
- element.cluster.bbox.to_bottom_left_origin(
91
- page_no_to_page[element.page_no].size.height
92
- ).as_tuple()
93
- )
94
-
95
- if isinstance(element, TextElement):
96
- main_text.append(
97
- BaseText(
98
- text=element.text,
99
- obj_type=layout_label_to_ds_type.get(element.label),
100
- name=element.label,
101
- prov=[
102
- Prov(
103
- bbox=target_bbox,
104
- page=element.page_no + 1,
105
- span=[0, len(element.text)],
106
- )
107
- ],
108
- )
109
- )
110
- elif isinstance(element, Table):
111
- index = len(tables)
112
- ref_str = f"#/tables/{index}"
113
- main_text.append(
114
- Ref(
115
- name=element.label,
116
- obj_type=layout_label_to_ds_type.get(element.label),
117
- ref=ref_str,
118
- ),
119
- )
120
-
121
- # Initialise empty table data grid (only empty cells)
122
- table_data = [
123
- [
124
- TableCell(
125
- text="",
126
- # bbox=[0,0,0,0],
127
- spans=[[i, j]],
128
- obj_type="body",
129
- )
130
- for j in range(element.num_cols)
131
- ]
132
- for i in range(element.num_rows)
133
- ]
134
-
135
- # Overwrite cells in table data for which there is actual cell content.
136
- for cell in element.table_cells:
137
- for i in range(
138
- min(cell.start_row_offset_idx, element.num_rows),
139
- min(cell.end_row_offset_idx, element.num_rows),
140
- ):
141
- for j in range(
142
- min(cell.start_col_offset_idx, element.num_cols),
143
- min(cell.end_col_offset_idx, element.num_cols),
144
- ):
145
- celltype = "body"
146
- if cell.column_header:
147
- celltype = "col_header"
148
- elif cell.row_header:
149
- celltype = "row_header"
150
- elif cell.row_section:
151
- celltype = "row_section"
152
-
153
- def make_spans(cell):
154
- for rspan in range(
155
- min(cell.start_row_offset_idx, element.num_rows),
156
- min(cell.end_row_offset_idx, element.num_rows),
157
- ):
158
- for cspan in range(
159
- min(
160
- cell.start_col_offset_idx, element.num_cols
161
- ),
162
- min(cell.end_col_offset_idx, element.num_cols),
163
- ):
164
- yield [rspan, cspan]
165
-
166
- spans = list(make_spans(cell))
167
- if cell.bbox is not None:
168
- bbox = cell.bbox.to_bottom_left_origin(
169
- page_no_to_page[element.page_no].size.height
170
- ).as_tuple()
171
- else:
172
- bbox = None
173
-
174
- table_data[i][j] = TableCell(
175
- text=cell.text,
176
- bbox=bbox,
177
- # col=j,
178
- # row=i,
179
- spans=spans,
180
- obj_type=celltype,
181
- # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
182
- # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
183
- )
184
-
185
- tables.append(
186
- DsSchemaTable(
187
- num_cols=element.num_cols,
188
- num_rows=element.num_rows,
189
- obj_type=layout_label_to_ds_type.get(element.label),
190
- data=table_data,
191
- prov=[
192
- Prov(
193
- bbox=target_bbox,
194
- page=element.page_no + 1,
195
- span=[0, 0],
196
- )
197
- ],
198
- )
199
- )
200
-
201
- elif isinstance(element, FigureElement):
202
- index = len(figures)
203
- ref_str = f"#/figures/{index}"
204
- main_text.append(
205
- Ref(
206
- name=element.label,
207
- obj_type=layout_label_to_ds_type.get(element.label),
208
- ref=ref_str,
209
- ),
210
- )
211
- figures.append(
212
- Figure(
213
- prov=[
214
- Prov(
215
- bbox=target_bbox,
216
- page=element.page_no + 1,
217
- span=[0, 0],
218
- )
219
- ],
220
- obj_type=layout_label_to_ds_type.get(element.label),
221
- payload={
222
- "children": TypeAdapter(List[Cluster]).dump_python(
223
- element.cluster.children
224
- )
225
- }, # hack to channel child clusters through GLM
226
- )
227
- )
228
- elif isinstance(element, ContainerElement):
229
- main_text.append(
230
- BaseText(
231
- text="",
232
- payload={
233
- "children": TypeAdapter(List[Cluster]).dump_python(
234
- element.cluster.children
235
- )
236
- }, # hack to channel child clusters through GLM
237
- obj_type=layout_label_to_ds_type.get(element.label),
238
- name=element.label,
239
- prov=[
240
- Prov(
241
- bbox=target_bbox,
242
- page=element.page_no + 1,
243
- span=[0, 0],
244
- )
245
- ],
246
- )
247
- )
248
-
249
- # We can throw in headers and footers at the end of the legacy doc
250
- # since the reading-order will re-sort it later.
251
- for element in conv_res.assembled.headers:
252
- # Convert bboxes to lower-left origin.
253
- target_bbox = DsBoundingBox(
254
- element.cluster.bbox.to_bottom_left_origin(
255
- page_no_to_page[element.page_no].size.height
256
- ).as_tuple()
257
- )
258
-
259
- if isinstance(element, TextElement):
260
-
261
- tel = BaseText(
262
- text=element.text,
263
- obj_type=layout_label_to_ds_type.get(element.label),
264
- name=element.label,
265
- prov=[
266
- Prov(
267
- bbox=target_bbox,
268
- page=element.page_no + 1,
269
- span=[0, len(element.text)],
270
- )
271
- ],
272
- )
273
- if element.label == DocItemLabel.PAGE_HEADER:
274
- index = len(page_headers)
275
- ref_str = f"#/page-headers/{index}"
276
- main_text.append(
277
- Ref(
278
- name=element.label,
279
- obj_type=layout_label_to_ds_type.get(element.label),
280
- ref=ref_str,
281
- ),
282
- )
283
- page_headers.append(tel)
284
- elif element.label == DocItemLabel.PAGE_FOOTER:
285
- index = len(page_footers)
286
- ref_str = f"#/page-footers/{index}"
287
- main_text.append(
288
- Ref(
289
- name=element.label,
290
- obj_type=layout_label_to_ds_type.get(element.label),
291
- ref=ref_str,
292
- ),
293
- )
294
- page_footers.append(tel)
295
-
296
- page_dimensions = [
297
- PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
298
- for p in conv_res.pages
299
- if p.size is not None
300
- ]
301
-
302
- ds_doc: DsDocument = DsDocument(
303
- name=title,
304
- description=desc,
305
- file_info=file_info,
306
- main_text=main_text,
307
- tables=tables,
308
- figures=figures,
309
- page_dimensions=page_dimensions,
310
- page_headers=page_headers,
311
- page_footers=page_footers,
312
- )
313
-
314
- return ds_doc
315
-
316
- def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
317
- with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
318
- ds_doc = self._to_legacy_document(conv_res)
319
- ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
320
-
321
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
322
-
323
- docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
324
- 1 == 1
325
-
326
- # DEBUG code:
327
- def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
328
- clusters_to_draw = []
329
- image = copy.deepcopy(conv_res.pages[page_no].image)
330
- for ix, elem in enumerate(ds_document.main_text):
331
- if isinstance(elem, BaseText):
332
- prov = elem.prov[0] # type: ignore
333
- elif isinstance(elem, Ref):
334
- _, arr, index = elem.ref.split("/")
335
- index = int(index) # type: ignore
336
- if arr == "tables":
337
- prov = ds_document.tables[index].prov[0]
338
- elif arr == "figures":
339
- prov = ds_document.pictures[index].prov[0]
340
- else:
341
- prov = None
342
-
343
- if prov and prov.page == page_no:
344
- clusters_to_draw.append(
345
- Cluster(
346
- id=ix,
347
- label=elem.name,
348
- bbox=BoundingBox.from_tuple(
349
- coord=prov.bbox, # type: ignore
350
- origin=CoordOrigin.BOTTOMLEFT,
351
- ).to_top_left_origin(conv_res.pages[page_no].size.height),
352
- )
353
- )
354
-
355
- draw = ImageDraw.Draw(image)
356
- for c in clusters_to_draw:
357
- x0, y0, x1, y1 = c.bbox.as_tuple()
358
- draw.rectangle([(x0, y0), (x1, y1)], outline="red")
359
- draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
360
-
361
- cell_color = (
362
- random.randint(30, 140),
363
- random.randint(30, 140),
364
- random.randint(30, 140),
365
- )
366
- for tc in c.cells: # [:1]:
367
- x0, y0, x1, y1 = tc.bbox.as_tuple()
368
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
369
-
370
- if show:
371
- image.show()
372
- else:
373
- out_path: Path = (
374
- Path(settings.debug.debug_output_path)
375
- / f"debug_{conv_res.input.file.stem}"
376
- )
377
- out_path.mkdir(parents=True, exist_ok=True)
378
-
379
- out_file = out_path / f"doc_page_{page_no:05}.png"
380
- image.save(str(out_file), format="png")
381
-
382
- # for item in ds_doc.page_dimensions:
383
- # page_no = item.page
384
- # draw_clusters_and_cells(ds_doc, page_no)
385
-
386
- return docling_doc
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes