docling 2.20.0__py3-none-any.whl → 2.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/models/ds_glm_model.py +60 -2
- docling/utils/glm_utils.py +10 -0
- {docling-2.20.0.dist-info → docling-2.21.0.dist-info}/METADATA +2 -2
- {docling-2.20.0.dist-info → docling-2.21.0.dist-info}/RECORD +7 -7
- {docling-2.20.0.dist-info → docling-2.21.0.dist-info}/LICENSE +0 -0
- {docling-2.20.0.dist-info → docling-2.21.0.dist-info}/WHEEL +0 -0
- {docling-2.20.0.dist-info → docling-2.21.0.dist-info}/entry_points.txt +0 -0
docling/models/ds_glm_model.py
CHANGED
@@ -4,7 +4,12 @@ from pathlib import Path
|
|
4
4
|
from typing import List, Union
|
5
5
|
|
6
6
|
from deepsearch_glm.andromeda_nlp import nlp_model
|
7
|
-
from docling_core.types.doc import
|
7
|
+
from docling_core.types.doc import (
|
8
|
+
BoundingBox,
|
9
|
+
CoordOrigin,
|
10
|
+
DocItemLabel,
|
11
|
+
DoclingDocument,
|
12
|
+
)
|
8
13
|
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
9
14
|
from docling_core.types.legacy_doc.base import (
|
10
15
|
Figure,
|
@@ -71,12 +76,15 @@ class GlmModel:
|
|
71
76
|
)
|
72
77
|
|
73
78
|
main_text: List[Union[Ref, BaseText]] = []
|
79
|
+
page_headers: List[Union[Ref, BaseText]] = []
|
80
|
+
page_footers: List[Union[Ref, BaseText]] = []
|
81
|
+
|
74
82
|
tables: List[DsSchemaTable] = []
|
75
83
|
figures: List[Figure] = []
|
76
84
|
|
77
85
|
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
78
86
|
|
79
|
-
for element in conv_res.assembled.
|
87
|
+
for element in conv_res.assembled.body:
|
80
88
|
# Convert bboxes to lower-left origin.
|
81
89
|
target_bbox = DsBoundingBox(
|
82
90
|
element.cluster.bbox.to_bottom_left_origin(
|
@@ -238,6 +246,53 @@ class GlmModel:
|
|
238
246
|
)
|
239
247
|
)
|
240
248
|
|
249
|
+
# We can throw in headers and footers at the end of the legacy doc
|
250
|
+
# since the reading-order will re-sort it later.
|
251
|
+
for element in conv_res.assembled.headers:
|
252
|
+
# Convert bboxes to lower-left origin.
|
253
|
+
target_bbox = DsBoundingBox(
|
254
|
+
element.cluster.bbox.to_bottom_left_origin(
|
255
|
+
page_no_to_page[element.page_no].size.height
|
256
|
+
).as_tuple()
|
257
|
+
)
|
258
|
+
|
259
|
+
if isinstance(element, TextElement):
|
260
|
+
|
261
|
+
tel = BaseText(
|
262
|
+
text=element.text,
|
263
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
264
|
+
name=element.label,
|
265
|
+
prov=[
|
266
|
+
Prov(
|
267
|
+
bbox=target_bbox,
|
268
|
+
page=element.page_no + 1,
|
269
|
+
span=[0, len(element.text)],
|
270
|
+
)
|
271
|
+
],
|
272
|
+
)
|
273
|
+
if element.label == DocItemLabel.PAGE_HEADER:
|
274
|
+
index = len(page_headers)
|
275
|
+
ref_str = f"#/page-headers/{index}"
|
276
|
+
main_text.append(
|
277
|
+
Ref(
|
278
|
+
name=element.label,
|
279
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
280
|
+
ref=ref_str,
|
281
|
+
),
|
282
|
+
)
|
283
|
+
page_headers.append(tel)
|
284
|
+
elif element.label == DocItemLabel.PAGE_FOOTER:
|
285
|
+
index = len(page_footers)
|
286
|
+
ref_str = f"#/page-footers/{index}"
|
287
|
+
main_text.append(
|
288
|
+
Ref(
|
289
|
+
name=element.label,
|
290
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
291
|
+
ref=ref_str,
|
292
|
+
),
|
293
|
+
)
|
294
|
+
page_footers.append(tel)
|
295
|
+
|
241
296
|
page_dimensions = [
|
242
297
|
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
243
298
|
for p in conv_res.pages
|
@@ -252,6 +307,8 @@ class GlmModel:
|
|
252
307
|
tables=tables,
|
253
308
|
figures=figures,
|
254
309
|
page_dimensions=page_dimensions,
|
310
|
+
page_headers=page_headers,
|
311
|
+
page_footers=page_footers,
|
255
312
|
)
|
256
313
|
|
257
314
|
return ds_doc
|
@@ -264,6 +321,7 @@ class GlmModel:
|
|
264
321
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
265
322
|
|
266
323
|
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
324
|
+
1 == 1
|
267
325
|
|
268
326
|
# DEBUG code:
|
269
327
|
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
docling/utils/glm_utils.py
CHANGED
@@ -15,6 +15,7 @@ from docling_core.types.doc import (
|
|
15
15
|
TableCell,
|
16
16
|
TableData,
|
17
17
|
)
|
18
|
+
from docling_core.types.doc.document import ContentLayer
|
18
19
|
|
19
20
|
|
20
21
|
def resolve_item(paths, obj):
|
@@ -311,6 +312,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|
311
312
|
current_list = None
|
312
313
|
|
313
314
|
doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
|
315
|
+
elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
316
|
+
current_list = None
|
317
|
+
|
318
|
+
doc.add_text(
|
319
|
+
label=DocItemLabel(name_label),
|
320
|
+
text=text,
|
321
|
+
prov=prov,
|
322
|
+
content_layer=ContentLayer.FURNITURE,
|
323
|
+
)
|
314
324
|
else:
|
315
325
|
current_list = None
|
316
326
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.20.0
|
3
|
+
Version: 2.21.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -28,7 +28,7 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
|
29
29
|
Requires-Dist: certifi (>=2024.7.4)
|
30
30
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
31
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
31
|
+
Requires-Dist: docling-core[chunking] (>=2.18.0,<3.0.0)
|
32
32
|
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
33
33
|
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
34
34
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -33,7 +33,7 @@ docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,
|
|
33
33
|
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
34
34
|
docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
|
35
35
|
docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
|
36
|
-
docling/models/ds_glm_model.py,sha256=
|
36
|
+
docling/models/ds_glm_model.py,sha256=1jLEM-B_oHFevKq23zDQpdifE3eJL7qiLr5YLpEf1kQ,15217
|
37
37
|
docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
|
38
38
|
docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
|
39
39
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
@@ -54,15 +54,15 @@ docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
54
54
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
55
55
|
docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
|
56
56
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
57
|
-
docling/utils/glm_utils.py,sha256=
|
57
|
+
docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
|
58
58
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
59
59
|
docling/utils/model_downloader.py,sha256=XK3ozGXyQcNPvrSsevTwR9VnY41JWovlsGk_ZBnu6FU,2787
|
60
60
|
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
61
61
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
62
62
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
63
63
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
64
|
-
docling-2.
|
65
|
-
docling-2.
|
66
|
-
docling-2.
|
67
|
-
docling-2.
|
68
|
-
docling-2.
|
64
|
+
docling-2.21.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
65
|
+
docling-2.21.0.dist-info/METADATA,sha256=HX1DmAVvGHCA61nnfg2pjFOEtMPVV_0ou9YWgfGEVhU,8720
|
66
|
+
docling-2.21.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
67
|
+
docling-2.21.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
68
|
+
docling-2.21.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|