docling 2.20.0__py3-none-any.whl → 2.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,12 @@ from pathlib import Path
4
4
  from typing import List, Union
5
5
 
6
6
  from deepsearch_glm.andromeda_nlp import nlp_model
7
- from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
7
+ from docling_core.types.doc import (
8
+ BoundingBox,
9
+ CoordOrigin,
10
+ DocItemLabel,
11
+ DoclingDocument,
12
+ )
8
13
  from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
9
14
  from docling_core.types.legacy_doc.base import (
10
15
  Figure,
@@ -71,12 +76,15 @@ class GlmModel:
71
76
  )
72
77
 
73
78
  main_text: List[Union[Ref, BaseText]] = []
79
+ page_headers: List[Union[Ref, BaseText]] = []
80
+ page_footers: List[Union[Ref, BaseText]] = []
81
+
74
82
  tables: List[DsSchemaTable] = []
75
83
  figures: List[Figure] = []
76
84
 
77
85
  page_no_to_page = {p.page_no: p for p in conv_res.pages}
78
86
 
79
- for element in conv_res.assembled.elements:
87
+ for element in conv_res.assembled.body:
80
88
  # Convert bboxes to lower-left origin.
81
89
  target_bbox = DsBoundingBox(
82
90
  element.cluster.bbox.to_bottom_left_origin(
@@ -238,6 +246,53 @@ class GlmModel:
238
246
  )
239
247
  )
240
248
 
249
+ # We can throw in headers and footers at the end of the legacy doc
250
+ # since the reading-order will re-sort it later.
251
+ for element in conv_res.assembled.headers:
252
+ # Convert bboxes to lower-left origin.
253
+ target_bbox = DsBoundingBox(
254
+ element.cluster.bbox.to_bottom_left_origin(
255
+ page_no_to_page[element.page_no].size.height
256
+ ).as_tuple()
257
+ )
258
+
259
+ if isinstance(element, TextElement):
260
+
261
+ tel = BaseText(
262
+ text=element.text,
263
+ obj_type=layout_label_to_ds_type.get(element.label),
264
+ name=element.label,
265
+ prov=[
266
+ Prov(
267
+ bbox=target_bbox,
268
+ page=element.page_no + 1,
269
+ span=[0, len(element.text)],
270
+ )
271
+ ],
272
+ )
273
+ if element.label == DocItemLabel.PAGE_HEADER:
274
+ index = len(page_headers)
275
+ ref_str = f"#/page-headers/{index}"
276
+ main_text.append(
277
+ Ref(
278
+ name=element.label,
279
+ obj_type=layout_label_to_ds_type.get(element.label),
280
+ ref=ref_str,
281
+ ),
282
+ )
283
+ page_headers.append(tel)
284
+ elif element.label == DocItemLabel.PAGE_FOOTER:
285
+ index = len(page_footers)
286
+ ref_str = f"#/page-footers/{index}"
287
+ main_text.append(
288
+ Ref(
289
+ name=element.label,
290
+ obj_type=layout_label_to_ds_type.get(element.label),
291
+ ref=ref_str,
292
+ ),
293
+ )
294
+ page_footers.append(tel)
295
+
241
296
  page_dimensions = [
242
297
  PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
243
298
  for p in conv_res.pages
@@ -252,6 +307,8 @@ class GlmModel:
252
307
  tables=tables,
253
308
  figures=figures,
254
309
  page_dimensions=page_dimensions,
310
+ page_headers=page_headers,
311
+ page_footers=page_footers,
255
312
  )
256
313
 
257
314
  return ds_doc
@@ -264,6 +321,7 @@ class GlmModel:
264
321
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
265
322
 
266
323
  docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
324
+ 1 == 1
267
325
 
268
326
  # DEBUG code:
269
327
  def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
@@ -15,6 +15,7 @@ from docling_core.types.doc import (
15
15
  TableCell,
16
16
  TableData,
17
17
  )
18
+ from docling_core.types.doc.document import ContentLayer
18
19
 
19
20
 
20
21
  def resolve_item(paths, obj):
@@ -311,6 +312,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
311
312
  current_list = None
312
313
 
313
314
  doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
315
+ elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
316
+ current_list = None
317
+
318
+ doc.add_text(
319
+ label=DocItemLabel(name_label),
320
+ text=text,
321
+ prov=prov,
322
+ content_layer=ContentLayer.FURNITURE,
323
+ )
314
324
  else:
315
325
  current_list = None
316
326
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.20.0
3
+ Version: 2.21.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
29
29
  Requires-Dist: certifi (>=2024.7.4)
30
30
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
31
- Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.18.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -33,7 +33,7 @@ docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,
33
33
  docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
34
34
  docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
35
35
  docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
36
- docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
36
+ docling/models/ds_glm_model.py,sha256=1jLEM-B_oHFevKq23zDQpdifE3eJL7qiLr5YLpEf1kQ,15217
37
37
  docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
38
38
  docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
39
39
  docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
@@ -54,15 +54,15 @@ docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
54
54
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
55
  docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
56
56
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
57
- docling/utils/glm_utils.py,sha256=uyCoFTX9FbS1Ke0aSlkdzGLUt08dZfkgriWadkyLiiA,11856
57
+ docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
58
58
  docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
59
59
  docling/utils/model_downloader.py,sha256=XK3ozGXyQcNPvrSsevTwR9VnY41JWovlsGk_ZBnu6FU,2787
60
60
  docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
61
61
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
62
62
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
63
63
  docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
64
- docling-2.20.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
65
- docling-2.20.0.dist-info/METADATA,sha256=9g0XmFk8hxdswqwT5jWnrUPKswGA26JDlyV5hqJ_tCc,8720
66
- docling-2.20.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
67
- docling-2.20.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
68
- docling-2.20.0.dist-info/RECORD,,
64
+ docling-2.21.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
65
+ docling-2.21.0.dist-info/METADATA,sha256=HX1DmAVvGHCA61nnfg2pjFOEtMPVV_0ou9YWgfGEVhU,8720
66
+ docling-2.21.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
67
+ docling-2.21.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
68
+ docling-2.21.0.dist-info/RECORD,,