docling-core 2.33.1__py3-none-any.whl → 2.34.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/serializer/base.py +34 -0
- docling_core/transforms/serializer/common.py +37 -3
- docling_core/transforms/serializer/doctags.py +65 -6
- docling_core/transforms/serializer/html.py +61 -23
- docling_core/transforms/serializer/markdown.py +85 -18
- docling_core/types/doc/document.py +213 -51
- {docling_core-2.33.1.dist-info → docling_core-2.34.1.dist-info}/METADATA +1 -1
- {docling_core-2.33.1.dist-info → docling_core-2.34.1.dist-info}/RECORD +12 -12
- {docling_core-2.33.1.dist-info → docling_core-2.34.1.dist-info}/WHEEL +0 -0
- {docling_core-2.33.1.dist-info → docling_core-2.34.1.dist-info}/entry_points.txt +0 -0
- {docling_core-2.33.1.dist-info → docling_core-2.34.1.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.33.1.dist-info → docling_core-2.34.1.dist-info}/top_level.txt +0 -0
|
@@ -202,6 +202,16 @@ class BaseDocSerializer(ABC):
|
|
|
202
202
|
"""Hook for strikethrough formatting serialization."""
|
|
203
203
|
...
|
|
204
204
|
|
|
205
|
+
@abstractmethod
|
|
206
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
207
|
+
"""Hook for subscript formatting serialization."""
|
|
208
|
+
...
|
|
209
|
+
|
|
210
|
+
@abstractmethod
|
|
211
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
212
|
+
"""Hook for superscript formatting serialization."""
|
|
213
|
+
...
|
|
214
|
+
|
|
205
215
|
@abstractmethod
|
|
206
216
|
def serialize_hyperlink(
|
|
207
217
|
self,
|
|
@@ -239,6 +249,15 @@ class BaseDocSerializer(ABC):
|
|
|
239
249
|
"""Serialize the item's captions."""
|
|
240
250
|
...
|
|
241
251
|
|
|
252
|
+
@abstractmethod
|
|
253
|
+
def serialize_annotations(
|
|
254
|
+
self,
|
|
255
|
+
item: DocItem,
|
|
256
|
+
**kwargs: Any,
|
|
257
|
+
) -> SerializationResult:
|
|
258
|
+
"""Serialize the item's annotations."""
|
|
259
|
+
...
|
|
260
|
+
|
|
242
261
|
@abstractmethod
|
|
243
262
|
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
|
|
244
263
|
"""Get references to excluded items."""
|
|
@@ -257,3 +276,18 @@ class BaseSerializerProvider(ABC):
|
|
|
257
276
|
def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
|
|
258
277
|
"""Get a the associated serializer."""
|
|
259
278
|
...
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class BaseAnnotationSerializer(ABC):
|
|
282
|
+
"""Base class for annotation serializers."""
|
|
283
|
+
|
|
284
|
+
@abstractmethod
|
|
285
|
+
def serialize(
|
|
286
|
+
self,
|
|
287
|
+
*,
|
|
288
|
+
item: DocItem,
|
|
289
|
+
doc: DoclingDocument,
|
|
290
|
+
**kwargs: Any,
|
|
291
|
+
) -> SerializationResult:
|
|
292
|
+
"""Serializes the passed annotation."""
|
|
293
|
+
...
|
|
@@ -15,6 +15,7 @@ from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_fie
|
|
|
15
15
|
from typing_extensions import Self, override
|
|
16
16
|
|
|
17
17
|
from docling_core.transforms.serializer.base import (
|
|
18
|
+
BaseAnnotationSerializer,
|
|
18
19
|
BaseDocSerializer,
|
|
19
20
|
BaseFallbackSerializer,
|
|
20
21
|
BaseFormSerializer,
|
|
@@ -30,6 +31,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
30
31
|
from docling_core.types.doc.document import (
|
|
31
32
|
DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
32
33
|
ContentLayer,
|
|
34
|
+
DescriptionAnnotation,
|
|
33
35
|
DocItem,
|
|
34
36
|
DoclingDocument,
|
|
35
37
|
FloatingItem,
|
|
@@ -41,9 +43,10 @@ from docling_core.types.doc.document import (
|
|
|
41
43
|
OrderedList,
|
|
42
44
|
PictureClassificationData,
|
|
43
45
|
PictureDataType,
|
|
44
|
-
PictureDescriptionData,
|
|
45
46
|
PictureItem,
|
|
46
47
|
PictureMoleculeData,
|
|
48
|
+
Script,
|
|
49
|
+
TableAnnotationType,
|
|
47
50
|
TableItem,
|
|
48
51
|
TextItem,
|
|
49
52
|
UnorderedList,
|
|
@@ -122,7 +125,9 @@ def _iterate_items(
|
|
|
122
125
|
yield item
|
|
123
126
|
|
|
124
127
|
|
|
125
|
-
def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
|
|
128
|
+
def _get_annotation_text(
|
|
129
|
+
annotation: Union[PictureDataType, TableAnnotationType],
|
|
130
|
+
) -> Optional[str]:
|
|
126
131
|
result = None
|
|
127
132
|
if isinstance(annotation, PictureClassificationData):
|
|
128
133
|
predicted_class = (
|
|
@@ -132,7 +137,7 @@ def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
|
|
|
132
137
|
)
|
|
133
138
|
if predicted_class is not None:
|
|
134
139
|
result = predicted_class.replace("_", " ")
|
|
135
|
-
elif isinstance(annotation, PictureDescriptionData):
|
|
140
|
+
elif isinstance(annotation, DescriptionAnnotation):
|
|
136
141
|
result = annotation.text
|
|
137
142
|
elif isinstance(annotation, PictureMoleculeData):
|
|
138
143
|
result = annotation.smi
|
|
@@ -211,6 +216,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
211
216
|
list_serializer: BaseListSerializer
|
|
212
217
|
inline_serializer: BaseInlineSerializer
|
|
213
218
|
|
|
219
|
+
annotation_serializer: BaseAnnotationSerializer
|
|
220
|
+
|
|
214
221
|
params: CommonParams = CommonParams()
|
|
215
222
|
|
|
216
223
|
_excluded_refs_cache: dict[str, set[str]] = {}
|
|
@@ -449,6 +456,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
449
456
|
res = self.serialize_underline(text=res)
|
|
450
457
|
if formatting.strikethrough:
|
|
451
458
|
res = self.serialize_strikethrough(text=res)
|
|
459
|
+
if formatting.script == Script.SUB:
|
|
460
|
+
res = self.serialize_subscript(text=res)
|
|
461
|
+
elif formatting.script == Script.SUPER:
|
|
462
|
+
res = self.serialize_superscript(text=res)
|
|
452
463
|
if params.include_hyperlinks and hyperlink:
|
|
453
464
|
res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
|
|
454
465
|
return res
|
|
@@ -473,6 +484,16 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
473
484
|
"""Hook for strikethrough formatting serialization."""
|
|
474
485
|
return text
|
|
475
486
|
|
|
487
|
+
@override
|
|
488
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
489
|
+
"""Hook for subscript formatting serialization."""
|
|
490
|
+
return text
|
|
491
|
+
|
|
492
|
+
@override
|
|
493
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
494
|
+
"""Hook for superscript formatting serialization."""
|
|
495
|
+
return text
|
|
496
|
+
|
|
476
497
|
@override
|
|
477
498
|
def serialize_hyperlink(
|
|
478
499
|
self,
|
|
@@ -505,6 +526,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
505
526
|
text_res = ""
|
|
506
527
|
return create_ser_result(text=text_res, span_source=results)
|
|
507
528
|
|
|
529
|
+
@override
|
|
530
|
+
def serialize_annotations(
|
|
531
|
+
self,
|
|
532
|
+
item: DocItem,
|
|
533
|
+
**kwargs: Any,
|
|
534
|
+
) -> SerializationResult:
|
|
535
|
+
"""Serialize the item's annotations."""
|
|
536
|
+
return self.annotation_serializer.serialize(
|
|
537
|
+
item=item,
|
|
538
|
+
doc=self.doc,
|
|
539
|
+
**kwargs,
|
|
540
|
+
)
|
|
541
|
+
|
|
508
542
|
def _get_applicable_pages(self) -> Optional[list[int]]:
|
|
509
543
|
pages = {
|
|
510
544
|
item.prov[0].page_no: ...
|
|
@@ -7,6 +7,7 @@ from pydantic import BaseModel
|
|
|
7
7
|
from typing_extensions import override
|
|
8
8
|
|
|
9
9
|
from docling_core.transforms.serializer.base import (
|
|
10
|
+
BaseAnnotationSerializer,
|
|
10
11
|
BaseDocSerializer,
|
|
11
12
|
BaseFallbackSerializer,
|
|
12
13
|
BaseFormSerializer,
|
|
@@ -17,12 +18,14 @@ from docling_core.transforms.serializer.base import (
|
|
|
17
18
|
BaseTableSerializer,
|
|
18
19
|
BaseTextSerializer,
|
|
19
20
|
SerializationResult,
|
|
21
|
+
Span,
|
|
20
22
|
)
|
|
21
23
|
from docling_core.transforms.serializer.common import (
|
|
22
24
|
CommonParams,
|
|
23
25
|
DocSerializer,
|
|
24
26
|
create_ser_result,
|
|
25
27
|
)
|
|
28
|
+
from docling_core.types.doc.base import BoundingBox
|
|
26
29
|
from docling_core.types.doc.document import (
|
|
27
30
|
CodeItem,
|
|
28
31
|
DocItem,
|
|
@@ -38,6 +41,7 @@ from docling_core.types.doc.document import (
|
|
|
38
41
|
PictureItem,
|
|
39
42
|
PictureMoleculeData,
|
|
40
43
|
PictureTabularChartData,
|
|
44
|
+
ProvenanceItem,
|
|
41
45
|
TableItem,
|
|
42
46
|
TextItem,
|
|
43
47
|
UnorderedList,
|
|
@@ -414,6 +418,39 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
414
418
|
class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
415
419
|
"""DocTags-specific inline group serializer."""
|
|
416
420
|
|
|
421
|
+
def _get_inline_location_tags(
|
|
422
|
+
self, doc: DoclingDocument, item: InlineGroup, params: DocTagsParams
|
|
423
|
+
) -> SerializationResult:
|
|
424
|
+
|
|
425
|
+
prov: Optional[ProvenanceItem] = None
|
|
426
|
+
boxes: list[BoundingBox] = []
|
|
427
|
+
doc_items: list[DocItem] = []
|
|
428
|
+
for it, _ in doc.iterate_items(root=item):
|
|
429
|
+
if isinstance(it, DocItem):
|
|
430
|
+
for prov in it.prov:
|
|
431
|
+
boxes.append(prov.bbox)
|
|
432
|
+
doc_items.append(it)
|
|
433
|
+
if prov is None:
|
|
434
|
+
return create_ser_result()
|
|
435
|
+
|
|
436
|
+
bbox = BoundingBox.enclosing_bbox(boxes=boxes)
|
|
437
|
+
|
|
438
|
+
# using last seen prov as reference for page dims
|
|
439
|
+
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
|
|
440
|
+
|
|
441
|
+
loc_str = DocumentToken.get_location(
|
|
442
|
+
bbox=bbox.to_top_left_origin(page_h).as_tuple(),
|
|
443
|
+
page_w=page_w,
|
|
444
|
+
page_h=page_h,
|
|
445
|
+
xsize=params.xsize,
|
|
446
|
+
ysize=params.ysize,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
return SerializationResult(
|
|
450
|
+
text=loc_str,
|
|
451
|
+
spans=[Span(item=it) for it in doc_items],
|
|
452
|
+
)
|
|
453
|
+
|
|
417
454
|
@override
|
|
418
455
|
def serialize(
|
|
419
456
|
self,
|
|
@@ -428,12 +465,23 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
428
465
|
"""Serializes the passed item."""
|
|
429
466
|
my_visited = visited if visited is not None else set()
|
|
430
467
|
params = DocTagsParams(**kwargs)
|
|
431
|
-
parts =
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
468
|
+
parts: List[SerializationResult] = []
|
|
469
|
+
if params.add_location:
|
|
470
|
+
inline_loc_tags_ser_res = self._get_inline_location_tags(
|
|
471
|
+
doc=doc,
|
|
472
|
+
item=item,
|
|
473
|
+
params=params,
|
|
474
|
+
)
|
|
475
|
+
parts.append(inline_loc_tags_ser_res)
|
|
476
|
+
params.add_location = False # suppress children location serialization
|
|
477
|
+
parts.extend(
|
|
478
|
+
doc_serializer.get_parts(
|
|
479
|
+
item=item,
|
|
480
|
+
list_level=list_level,
|
|
481
|
+
is_inline_scope=True,
|
|
482
|
+
visited=my_visited,
|
|
483
|
+
**{**kwargs, **params.model_dump()},
|
|
484
|
+
)
|
|
437
485
|
)
|
|
438
486
|
wrap_tag = DocumentToken.INLINE.value
|
|
439
487
|
delim = _get_delim(params=params)
|
|
@@ -460,6 +508,15 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
460
508
|
return create_ser_result()
|
|
461
509
|
|
|
462
510
|
|
|
511
|
+
class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
|
|
512
|
+
"""DocTags-specific annotation serializer."""
|
|
513
|
+
|
|
514
|
+
@override
|
|
515
|
+
def serialize(self, *, item: DocItem, **kwargs: Any) -> SerializationResult:
|
|
516
|
+
"""Serializes the item's annotations."""
|
|
517
|
+
return create_ser_result()
|
|
518
|
+
|
|
519
|
+
|
|
463
520
|
class DocTagsDocSerializer(DocSerializer):
|
|
464
521
|
"""DocTags-specific document serializer."""
|
|
465
522
|
|
|
@@ -473,6 +530,8 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
473
530
|
list_serializer: BaseListSerializer = DocTagsListSerializer()
|
|
474
531
|
inline_serializer: BaseInlineSerializer = DocTagsInlineSerializer()
|
|
475
532
|
|
|
533
|
+
annotation_serializer: BaseAnnotationSerializer = DocTagsAnnotationSerializer()
|
|
534
|
+
|
|
476
535
|
params: DocTagsParams = DocTagsParams()
|
|
477
536
|
|
|
478
537
|
@override
|
|
@@ -21,6 +21,7 @@ from pydantic import AnyUrl, BaseModel
|
|
|
21
21
|
from typing_extensions import override
|
|
22
22
|
|
|
23
23
|
from docling_core.transforms.serializer.base import (
|
|
24
|
+
BaseAnnotationSerializer,
|
|
24
25
|
BaseDocSerializer,
|
|
25
26
|
BaseFallbackSerializer,
|
|
26
27
|
BaseFormSerializer,
|
|
@@ -35,7 +36,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
35
36
|
from docling_core.transforms.serializer.common import (
|
|
36
37
|
CommonParams,
|
|
37
38
|
DocSerializer,
|
|
38
|
-
|
|
39
|
+
_get_annotation_text,
|
|
39
40
|
create_ser_result,
|
|
40
41
|
)
|
|
41
42
|
from docling_core.transforms.serializer.html_styles import (
|
|
@@ -47,6 +48,7 @@ from docling_core.types.doc.base import ImageRefMode
|
|
|
47
48
|
from docling_core.types.doc.document import (
|
|
48
49
|
CodeItem,
|
|
49
50
|
ContentLayer,
|
|
51
|
+
DescriptionAnnotation,
|
|
50
52
|
DocItem,
|
|
51
53
|
DoclingDocument,
|
|
52
54
|
FloatingItem,
|
|
@@ -59,7 +61,9 @@ from docling_core.types.doc.document import (
|
|
|
59
61
|
ListItem,
|
|
60
62
|
NodeItem,
|
|
61
63
|
OrderedList,
|
|
64
|
+
PictureClassificationData,
|
|
62
65
|
PictureItem,
|
|
66
|
+
PictureMoleculeData,
|
|
63
67
|
PictureTabularChartData,
|
|
64
68
|
SectionHeaderItem,
|
|
65
69
|
TableCell,
|
|
@@ -758,14 +762,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
758
762
|
"""HTML-specific fallback serializer."""
|
|
759
763
|
|
|
760
764
|
@override
|
|
761
|
-
def serialize(
|
|
762
|
-
self,
|
|
763
|
-
*,
|
|
764
|
-
item: NodeItem,
|
|
765
|
-
doc_serializer: "BaseDocSerializer",
|
|
766
|
-
doc: DoclingDocument,
|
|
767
|
-
**kwargs: Any,
|
|
768
|
-
) -> SerializationResult:
|
|
765
|
+
def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
|
|
769
766
|
"""Fallback serializer for items not handled by other serializers."""
|
|
770
767
|
if isinstance(item, DocItem):
|
|
771
768
|
return create_ser_result(
|
|
@@ -777,6 +774,42 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
777
774
|
return create_ser_result()
|
|
778
775
|
|
|
779
776
|
|
|
777
|
+
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
778
|
+
"""HTML-specific annotation serializer."""
|
|
779
|
+
|
|
780
|
+
def serialize(
|
|
781
|
+
self,
|
|
782
|
+
*,
|
|
783
|
+
item: DocItem,
|
|
784
|
+
doc: DoclingDocument,
|
|
785
|
+
**kwargs: Any,
|
|
786
|
+
) -> SerializationResult:
|
|
787
|
+
"""Serializes the passed annotation to HTML format."""
|
|
788
|
+
res_parts: list[SerializationResult] = []
|
|
789
|
+
for ann in item.get_annotations():
|
|
790
|
+
if isinstance(
|
|
791
|
+
ann,
|
|
792
|
+
(PictureClassificationData, DescriptionAnnotation, PictureMoleculeData),
|
|
793
|
+
):
|
|
794
|
+
if ann_text := _get_annotation_text(ann):
|
|
795
|
+
text_dir = get_text_direction(ann_text)
|
|
796
|
+
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
|
|
797
|
+
ann_ser_res = create_ser_result(
|
|
798
|
+
text=(
|
|
799
|
+
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
|
|
800
|
+
f"{html.escape(ann_text)}"
|
|
801
|
+
f"</div>"
|
|
802
|
+
),
|
|
803
|
+
span_source=item,
|
|
804
|
+
)
|
|
805
|
+
res_parts.append(ann_ser_res)
|
|
806
|
+
|
|
807
|
+
return create_ser_result(
|
|
808
|
+
text=" ".join([r.text for r in res_parts if r.text]),
|
|
809
|
+
span_source=res_parts,
|
|
810
|
+
)
|
|
811
|
+
|
|
812
|
+
|
|
780
813
|
class HTMLDocSerializer(DocSerializer):
|
|
781
814
|
"""HTML-specific document serializer."""
|
|
782
815
|
|
|
@@ -790,6 +823,8 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
790
823
|
list_serializer: BaseListSerializer = HTMLListSerializer()
|
|
791
824
|
inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
|
|
792
825
|
|
|
826
|
+
annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
|
|
827
|
+
|
|
793
828
|
params: HTMLParams = HTMLParams()
|
|
794
829
|
|
|
795
830
|
@override
|
|
@@ -812,6 +847,16 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
812
847
|
"""Apply HTML-specific strikethrough serialization."""
|
|
813
848
|
return f"<del>{text}</del>"
|
|
814
849
|
|
|
850
|
+
@override
|
|
851
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
852
|
+
"""Apply HTML-specific subscript serialization."""
|
|
853
|
+
return f"<sub>{text}</sub>"
|
|
854
|
+
|
|
855
|
+
@override
|
|
856
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
857
|
+
"""Apply HTML-specific superscript serialization."""
|
|
858
|
+
return f"<sup>{text}</sup>"
|
|
859
|
+
|
|
815
860
|
@override
|
|
816
861
|
def serialize_hyperlink(
|
|
817
862
|
self,
|
|
@@ -968,20 +1013,13 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
968
1013
|
results.append(cap_ser_res)
|
|
969
1014
|
|
|
970
1015
|
if params.include_annotations and item.self_ref not in excluded_refs:
|
|
971
|
-
if isinstance(item, PictureItem):
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
|
|
979
|
-
f"{html.escape(ann_text)}"
|
|
980
|
-
f"</div>"
|
|
981
|
-
),
|
|
982
|
-
span_source=item,
|
|
983
|
-
)
|
|
984
|
-
results.append(ann_ser_res)
|
|
1016
|
+
if isinstance(item, (PictureItem, TableItem)):
|
|
1017
|
+
ann_res = self.serialize_annotations(
|
|
1018
|
+
item=item,
|
|
1019
|
+
**kwargs,
|
|
1020
|
+
)
|
|
1021
|
+
if ann_res.text:
|
|
1022
|
+
results.append(ann_res)
|
|
985
1023
|
|
|
986
1024
|
text_res = params.caption_delim.join([r.text for r in results])
|
|
987
1025
|
if text_res:
|
|
@@ -15,6 +15,7 @@ from tabulate import tabulate
|
|
|
15
15
|
from typing_extensions import override
|
|
16
16
|
|
|
17
17
|
from docling_core.transforms.serializer.base import (
|
|
18
|
+
BaseAnnotationSerializer,
|
|
18
19
|
BaseDocSerializer,
|
|
19
20
|
BaseFallbackSerializer,
|
|
20
21
|
BaseFormSerializer,
|
|
@@ -29,7 +30,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
29
30
|
from docling_core.transforms.serializer.common import (
|
|
30
31
|
CommonParams,
|
|
31
32
|
DocSerializer,
|
|
32
|
-
|
|
33
|
+
_get_annotation_text,
|
|
33
34
|
_PageBreakSerResult,
|
|
34
35
|
create_ser_result,
|
|
35
36
|
)
|
|
@@ -37,6 +38,7 @@ from docling_core.types.doc.base import ImageRefMode
|
|
|
37
38
|
from docling_core.types.doc.document import (
|
|
38
39
|
CodeItem,
|
|
39
40
|
ContentLayer,
|
|
41
|
+
DescriptionAnnotation,
|
|
40
42
|
DocItem,
|
|
41
43
|
DoclingDocument,
|
|
42
44
|
FloatingItem,
|
|
@@ -48,7 +50,9 @@ from docling_core.types.doc.document import (
|
|
|
48
50
|
KeyValueItem,
|
|
49
51
|
NodeItem,
|
|
50
52
|
OrderedList,
|
|
53
|
+
PictureClassificationData,
|
|
51
54
|
PictureItem,
|
|
55
|
+
PictureMoleculeData,
|
|
52
56
|
PictureTabularChartData,
|
|
53
57
|
SectionHeaderItem,
|
|
54
58
|
TableItem,
|
|
@@ -58,6 +62,23 @@ from docling_core.types.doc.document import (
|
|
|
58
62
|
)
|
|
59
63
|
|
|
60
64
|
|
|
65
|
+
def _get_annotation_ser_result(
|
|
66
|
+
ann_kind: str, ann_text: str, mark_annotation: bool, doc_item: DocItem
|
|
67
|
+
):
|
|
68
|
+
return create_ser_result(
|
|
69
|
+
text=(
|
|
70
|
+
(
|
|
71
|
+
f'<!--<annotation kind="{ann_kind}">-->'
|
|
72
|
+
f"{ann_text}"
|
|
73
|
+
f"<!--<annotation/>-->"
|
|
74
|
+
)
|
|
75
|
+
if mark_annotation
|
|
76
|
+
else ann_text
|
|
77
|
+
),
|
|
78
|
+
span_source=doc_item,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
|
|
61
82
|
class MarkdownParams(CommonParams):
|
|
62
83
|
"""Markdown-specific serialization parameters."""
|
|
63
84
|
|
|
@@ -136,6 +157,49 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
136
157
|
return create_ser_result(text=text, span_source=res_parts)
|
|
137
158
|
|
|
138
159
|
|
|
160
|
+
class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
161
|
+
"""Markdown-specific annotation serializer."""
|
|
162
|
+
|
|
163
|
+
def serialize(
|
|
164
|
+
self,
|
|
165
|
+
*,
|
|
166
|
+
item: DocItem,
|
|
167
|
+
doc: DoclingDocument,
|
|
168
|
+
**kwargs: Any,
|
|
169
|
+
) -> SerializationResult:
|
|
170
|
+
"""Serialize the item's annotations."""
|
|
171
|
+
params = MarkdownParams(**kwargs)
|
|
172
|
+
|
|
173
|
+
res_parts: list[SerializationResult] = []
|
|
174
|
+
for ann in item.get_annotations():
|
|
175
|
+
if isinstance(
|
|
176
|
+
ann,
|
|
177
|
+
(
|
|
178
|
+
PictureClassificationData,
|
|
179
|
+
DescriptionAnnotation,
|
|
180
|
+
PictureMoleculeData,
|
|
181
|
+
),
|
|
182
|
+
):
|
|
183
|
+
if ann_text := _get_annotation_text(ann):
|
|
184
|
+
ann_res = create_ser_result(
|
|
185
|
+
text=(
|
|
186
|
+
(
|
|
187
|
+
f'<!--<annotation kind="{ann.kind}">-->'
|
|
188
|
+
f"{ann_text}"
|
|
189
|
+
f"<!--<annotation/>-->"
|
|
190
|
+
)
|
|
191
|
+
if params.mark_annotations
|
|
192
|
+
else ann_text
|
|
193
|
+
),
|
|
194
|
+
span_source=item,
|
|
195
|
+
)
|
|
196
|
+
res_parts.append(ann_res)
|
|
197
|
+
return create_ser_result(
|
|
198
|
+
text="\n\n".join([r.text for r in res_parts if r.text]),
|
|
199
|
+
span_source=item,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
|
|
139
203
|
class MarkdownTableSerializer(BaseTableSerializer):
|
|
140
204
|
"""Markdown-specific table item serializer."""
|
|
141
205
|
|
|
@@ -149,6 +213,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
149
213
|
**kwargs: Any,
|
|
150
214
|
) -> SerializationResult:
|
|
151
215
|
"""Serializes the passed item."""
|
|
216
|
+
params = MarkdownParams(**kwargs)
|
|
152
217
|
res_parts: list[SerializationResult] = []
|
|
153
218
|
|
|
154
219
|
cap_res = doc_serializer.serialize_captions(
|
|
@@ -159,6 +224,16 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
159
224
|
res_parts.append(cap_res)
|
|
160
225
|
|
|
161
226
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
227
|
+
|
|
228
|
+
if params.include_annotations:
|
|
229
|
+
|
|
230
|
+
ann_res = doc_serializer.serialize_annotations(
|
|
231
|
+
item=item,
|
|
232
|
+
**kwargs,
|
|
233
|
+
)
|
|
234
|
+
if ann_res.text:
|
|
235
|
+
res_parts.append(ann_res)
|
|
236
|
+
|
|
162
237
|
rows = [
|
|
163
238
|
[
|
|
164
239
|
# make sure that md tables are not broken
|
|
@@ -214,22 +289,12 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
214
289
|
|
|
215
290
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
216
291
|
if params.include_annotations:
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
f'<!--<annotation kind="{ann.kind}">-->'
|
|
224
|
-
f"{ann_text}"
|
|
225
|
-
f"<!--<annotation/>-->"
|
|
226
|
-
)
|
|
227
|
-
if params.mark_annotations
|
|
228
|
-
else ann_text
|
|
229
|
-
),
|
|
230
|
-
span_source=item,
|
|
231
|
-
)
|
|
232
|
-
res_parts.append(ann_ser_res)
|
|
292
|
+
ann_res = doc_serializer.serialize_annotations(
|
|
293
|
+
item=item,
|
|
294
|
+
**kwargs,
|
|
295
|
+
)
|
|
296
|
+
if ann_res.text:
|
|
297
|
+
res_parts.append(ann_res)
|
|
233
298
|
|
|
234
299
|
img_res = self._serialize_image_part(
|
|
235
300
|
item=item,
|
|
@@ -257,7 +322,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
257
322
|
res_parts.append(
|
|
258
323
|
create_ser_result(text=md_table_content, span_source=item)
|
|
259
324
|
)
|
|
260
|
-
text_res = "\n\n".join([r.text for r in res_parts])
|
|
325
|
+
text_res = "\n\n".join([r.text for r in res_parts if r.text])
|
|
261
326
|
|
|
262
327
|
return create_ser_result(text=text_res, span_source=res_parts)
|
|
263
328
|
|
|
@@ -471,6 +536,8 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
471
536
|
list_serializer: BaseListSerializer = MarkdownListSerializer()
|
|
472
537
|
inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
|
|
473
538
|
|
|
539
|
+
annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
|
|
540
|
+
|
|
474
541
|
params: MarkdownParams = MarkdownParams()
|
|
475
542
|
|
|
476
543
|
@override
|
|
@@ -15,7 +15,7 @@ import warnings
|
|
|
15
15
|
from enum import Enum
|
|
16
16
|
from io import BytesIO
|
|
17
17
|
from pathlib import Path
|
|
18
|
-
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
18
|
+
from typing import Any, Dict, Final, List, Literal, Optional, Sequence, Tuple, Union
|
|
19
19
|
from urllib.parse import unquote
|
|
20
20
|
|
|
21
21
|
import pandas as pd
|
|
@@ -30,6 +30,7 @@ from pydantic import (
|
|
|
30
30
|
computed_field,
|
|
31
31
|
field_validator,
|
|
32
32
|
model_validator,
|
|
33
|
+
validate_call,
|
|
33
34
|
)
|
|
34
35
|
from tabulate import tabulate
|
|
35
36
|
from typing_extensions import Annotated, Self, deprecated
|
|
@@ -53,7 +54,7 @@ _logger = logging.getLogger(__name__)
|
|
|
53
54
|
|
|
54
55
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
55
56
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
56
|
-
CURRENT_VERSION: Final = "1.
|
|
57
|
+
CURRENT_VERSION: Final = "1.4.0"
|
|
57
58
|
|
|
58
59
|
DEFAULT_EXPORT_LABELS = {
|
|
59
60
|
DocItemLabel.TITLE,
|
|
@@ -85,8 +86,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
|
|
|
85
86
|
)
|
|
86
87
|
|
|
87
88
|
|
|
88
|
-
class BasePictureData(BaseModel):
|
|
89
|
-
"""
|
|
89
|
+
class BaseAnnotation(BaseModel):
|
|
90
|
+
"""Base class for all annotation types."""
|
|
90
91
|
|
|
91
92
|
kind: str
|
|
92
93
|
|
|
@@ -98,7 +99,7 @@ class PictureClassificationClass(BaseModel):
|
|
|
98
99
|
confidence: float
|
|
99
100
|
|
|
100
101
|
|
|
101
|
-
class PictureClassificationData(BasePictureData):
|
|
102
|
+
class PictureClassificationData(BaseAnnotation):
|
|
102
103
|
"""PictureClassificationData."""
|
|
103
104
|
|
|
104
105
|
kind: Literal["classification"] = "classification"
|
|
@@ -106,19 +107,18 @@ class PictureClassificationData(BasePictureData):
|
|
|
106
107
|
predicted_classes: List[PictureClassificationClass]
|
|
107
108
|
|
|
108
109
|
|
|
109
|
-
class PictureDescriptionData(BasePictureData):
|
|
110
|
-
"""
|
|
110
|
+
class DescriptionAnnotation(BaseAnnotation):
|
|
111
|
+
"""DescriptionAnnotation."""
|
|
111
112
|
|
|
112
113
|
kind: Literal["description"] = "description"
|
|
113
114
|
text: str
|
|
114
115
|
provenance: str
|
|
115
116
|
|
|
116
117
|
|
|
117
|
-
class PictureMoleculeData(BaseModel):
|
|
118
|
+
class PictureMoleculeData(BaseAnnotation):
|
|
118
119
|
"""PictureMoleculeData."""
|
|
119
120
|
|
|
120
121
|
kind: Literal["molecule_data"] = "molecule_data"
|
|
121
|
-
|
|
122
122
|
smi: str
|
|
123
123
|
confidence: float
|
|
124
124
|
class_name: str
|
|
@@ -126,13 +126,19 @@ class PictureMoleculeData(BaseModel):
|
|
|
126
126
|
provenance: str
|
|
127
127
|
|
|
128
128
|
|
|
129
|
-
class
|
|
130
|
-
"""
|
|
129
|
+
class MiscAnnotation(BaseAnnotation):
|
|
130
|
+
"""MiscAnnotation."""
|
|
131
131
|
|
|
132
132
|
kind: Literal["misc"] = "misc"
|
|
133
133
|
content: Dict[str, Any]
|
|
134
134
|
|
|
135
135
|
|
|
136
|
+
# deprecated aliases:
|
|
137
|
+
BasePictureData = BaseAnnotation
|
|
138
|
+
PictureDescriptionData = DescriptionAnnotation
|
|
139
|
+
PictureMiscData = MiscAnnotation
|
|
140
|
+
|
|
141
|
+
|
|
136
142
|
class ChartLine(BaseModel):
|
|
137
143
|
"""Represents a line in a line chart.
|
|
138
144
|
|
|
@@ -196,7 +202,7 @@ class ChartPoint(BaseModel):
|
|
|
196
202
|
value: Tuple[float, float]
|
|
197
203
|
|
|
198
204
|
|
|
199
|
-
class PictureChartData(
|
|
205
|
+
class PictureChartData(BaseAnnotation):
|
|
200
206
|
"""Base class for picture chart data.
|
|
201
207
|
|
|
202
208
|
Attributes:
|
|
@@ -381,10 +387,10 @@ class PictureTabularChartData(PictureChartData):
|
|
|
381
387
|
|
|
382
388
|
PictureDataType = Annotated[
|
|
383
389
|
Union[
|
|
390
|
+
DescriptionAnnotation,
|
|
391
|
+
MiscAnnotation,
|
|
384
392
|
PictureClassificationData,
|
|
385
|
-
PictureDescriptionData,
|
|
386
393
|
PictureMoleculeData,
|
|
387
|
-
PictureMiscData,
|
|
388
394
|
PictureTabularChartData,
|
|
389
395
|
PictureLineChartData,
|
|
390
396
|
PictureBarChartData,
|
|
@@ -818,6 +824,18 @@ class DocItem(
|
|
|
818
824
|
)
|
|
819
825
|
return page_image.crop(crop_bbox.as_tuple())
|
|
820
826
|
|
|
827
|
+
def get_annotations(self) -> Sequence[BaseAnnotation]:
|
|
828
|
+
"""Get the annotations of this DocItem."""
|
|
829
|
+
return []
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
class Script(str, Enum):
|
|
833
|
+
"""Text script position."""
|
|
834
|
+
|
|
835
|
+
BASELINE = "baseline"
|
|
836
|
+
SUB = "sub"
|
|
837
|
+
SUPER = "super"
|
|
838
|
+
|
|
821
839
|
|
|
822
840
|
class Formatting(BaseModel):
|
|
823
841
|
"""Formatting."""
|
|
@@ -826,6 +844,7 @@ class Formatting(BaseModel):
|
|
|
826
844
|
italic: bool = False
|
|
827
845
|
underline: bool = False
|
|
828
846
|
strikethrough: bool = False
|
|
847
|
+
script: Script = Script.BASELINE
|
|
829
848
|
|
|
830
849
|
|
|
831
850
|
class TextItem(DocItem):
|
|
@@ -1182,6 +1201,19 @@ class PictureItem(FloatingItem):
|
|
|
1182
1201
|
text = serializer.serialize(item=self).text
|
|
1183
1202
|
return text
|
|
1184
1203
|
|
|
1204
|
+
def get_annotations(self) -> Sequence[BaseAnnotation]:
|
|
1205
|
+
"""Get the annotations of this PictureItem."""
|
|
1206
|
+
return self.annotations
|
|
1207
|
+
|
|
1208
|
+
|
|
1209
|
+
TableAnnotationType = Annotated[
|
|
1210
|
+
Union[
|
|
1211
|
+
DescriptionAnnotation,
|
|
1212
|
+
MiscAnnotation,
|
|
1213
|
+
],
|
|
1214
|
+
Field(discriminator="kind"),
|
|
1215
|
+
]
|
|
1216
|
+
|
|
1185
1217
|
|
|
1186
1218
|
class TableItem(FloatingItem):
|
|
1187
1219
|
"""TableItem."""
|
|
@@ -1192,6 +1224,8 @@ class TableItem(FloatingItem):
|
|
|
1192
1224
|
DocItemLabel.TABLE,
|
|
1193
1225
|
] = DocItemLabel.TABLE
|
|
1194
1226
|
|
|
1227
|
+
annotations: List[TableAnnotationType] = []
|
|
1228
|
+
|
|
1195
1229
|
def export_to_dataframe(self) -> pd.DataFrame:
|
|
1196
1230
|
"""Export the table as a Pandas DataFrame."""
|
|
1197
1231
|
if self.data.num_rows == 0 or self.data.num_cols == 0:
|
|
@@ -1438,6 +1472,15 @@ class TableItem(FloatingItem):
|
|
|
1438
1472
|
text = serializer.serialize(item=self).text
|
|
1439
1473
|
return text
|
|
1440
1474
|
|
|
1475
|
+
@validate_call
|
|
1476
|
+
def add_annotation(self, annotation: TableAnnotationType) -> None:
|
|
1477
|
+
"""Add an annotation to the table."""
|
|
1478
|
+
self.annotations.append(annotation)
|
|
1479
|
+
|
|
1480
|
+
def get_annotations(self) -> Sequence[BaseAnnotation]:
|
|
1481
|
+
"""Get the annotations of this TableItem."""
|
|
1482
|
+
return self.annotations
|
|
1483
|
+
|
|
1441
1484
|
|
|
1442
1485
|
class GraphCell(BaseModel):
|
|
1443
1486
|
"""GraphCell."""
|
|
@@ -1776,6 +1819,18 @@ class DoclingDocument(BaseModel):
|
|
|
1776
1819
|
item.parent = parent_ref
|
|
1777
1820
|
|
|
1778
1821
|
self.form_items.append(item)
|
|
1822
|
+
|
|
1823
|
+
elif isinstance(item, (UnorderedList, OrderedList, InlineGroup)):
|
|
1824
|
+
item_label = "groups"
|
|
1825
|
+
item_index = len(self.groups)
|
|
1826
|
+
|
|
1827
|
+
cref = f"#/{item_label}/{item_index}"
|
|
1828
|
+
|
|
1829
|
+
item.self_ref = cref
|
|
1830
|
+
item.parent = parent_ref
|
|
1831
|
+
|
|
1832
|
+
self.groups.append(item)
|
|
1833
|
+
|
|
1779
1834
|
else:
|
|
1780
1835
|
raise ValueError(f"Item {item} is not supported for insertion")
|
|
1781
1836
|
|
|
@@ -2111,6 +2166,9 @@ class DoclingDocument(BaseModel):
|
|
|
2111
2166
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
2112
2167
|
|
|
2113
2168
|
"""
|
|
2169
|
+
if not isinstance(parent, (OrderedList, UnorderedList)):
|
|
2170
|
+
warnings.warn("ListItem's parent must be a list group.", DeprecationWarning)
|
|
2171
|
+
|
|
2114
2172
|
if not parent:
|
|
2115
2173
|
parent = self.body
|
|
2116
2174
|
|
|
@@ -2267,6 +2325,7 @@ class DoclingDocument(BaseModel):
|
|
|
2267
2325
|
parent: Optional[NodeItem] = None,
|
|
2268
2326
|
label: DocItemLabel = DocItemLabel.TABLE,
|
|
2269
2327
|
content_layer: Optional[ContentLayer] = None,
|
|
2328
|
+
annotations: Optional[list[TableAnnotationType]] = None,
|
|
2270
2329
|
):
|
|
2271
2330
|
"""add_table.
|
|
2272
2331
|
|
|
@@ -2284,7 +2343,11 @@ class DoclingDocument(BaseModel):
|
|
|
2284
2343
|
cref = f"#/tables/{table_index}"
|
|
2285
2344
|
|
|
2286
2345
|
tbl_item = TableItem(
|
|
2287
|
-
label=label,
|
|
2346
|
+
label=label,
|
|
2347
|
+
data=data,
|
|
2348
|
+
self_ref=cref,
|
|
2349
|
+
parent=parent.get_ref(),
|
|
2350
|
+
annotations=annotations or [],
|
|
2288
2351
|
)
|
|
2289
2352
|
if prov:
|
|
2290
2353
|
tbl_item.prov.append(prov)
|
|
@@ -2301,7 +2364,7 @@ class DoclingDocument(BaseModel):
|
|
|
2301
2364
|
|
|
2302
2365
|
def add_picture(
|
|
2303
2366
|
self,
|
|
2304
|
-
annotations: List[PictureDataType] =
|
|
2367
|
+
annotations: Optional[List[PictureDataType]] = None,
|
|
2305
2368
|
image: Optional[ImageRef] = None,
|
|
2306
2369
|
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
2307
2370
|
prov: Optional[ProvenanceItem] = None,
|
|
@@ -2310,7 +2373,7 @@ class DoclingDocument(BaseModel):
|
|
|
2310
2373
|
):
|
|
2311
2374
|
"""add_picture.
|
|
2312
2375
|
|
|
2313
|
-
:param data: List[PictureData]: (Default value =
|
|
2376
|
+
:param data: Optional[List[PictureData]]: (Default value = None)
|
|
2314
2377
|
:param caption: Optional[Union[TextItem:
|
|
2315
2378
|
:param RefItem]]: (Default value = None)
|
|
2316
2379
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
@@ -2324,7 +2387,7 @@ class DoclingDocument(BaseModel):
|
|
|
2324
2387
|
|
|
2325
2388
|
fig_item = PictureItem(
|
|
2326
2389
|
label=DocItemLabel.PICTURE,
|
|
2327
|
-
annotations=annotations,
|
|
2390
|
+
annotations=annotations or [],
|
|
2328
2391
|
image=image,
|
|
2329
2392
|
self_ref=cref,
|
|
2330
2393
|
parent=parent.get_ref(),
|
|
@@ -3589,6 +3652,52 @@ class DoclingDocument(BaseModel):
|
|
|
3589
3652
|
|
|
3590
3653
|
return (GraphData(cells=cells, links=links), overall_prov)
|
|
3591
3654
|
|
|
3655
|
+
def _add_text(
    full_chunk: str,
    bbox: Optional[BoundingBox],
    pg_width: int,
    pg_height: int,
    page_no: int,
    tag_name: str,
    doc_label: DocItemLabel,
    doc: DoclingDocument,
    parent: Optional[NodeItem],
):
    """Add one text-like element to ``doc`` as a heading or a text item.

    :param full_chunk: str: the matched chunk; its text is obtained through
        ``extract_inner_text``
    :param bbox: Optional[BoundingBox]: when truthy, it is rescaled with
        ``pg_width`` / ``pg_height`` and stored as the item's provenance;
        otherwise no provenance is attached
    :param pg_width: int: first argument passed to ``bbox.resize_by_scale``
    :param pg_height: int: second argument passed to ``bbox.resize_by_scale``
    :param page_no: int: page number recorded in the provenance
    :param tag_name: str: tag the chunk was matched with
    :param doc_label: DocItemLabel: label selecting heading vs. plain text
    :param doc: DoclingDocument: document that receives the new item
    :param parent: Optional[NodeItem]: forwarded as the new item's parent
    """
    inner_text = extract_inner_text(full_chunk)

    # Provenance is only built when a bounding box was supplied.
    provenance = None
    if bbox:
        provenance = ProvenanceItem(
            bbox=bbox.resize_by_scale(pg_width, pg_height),
            charspan=(0, len(inner_text)),
            page_no=page_no,
        )

    # Page headers and footers go to the furniture layer, the rest to body.
    is_furniture = tag_name in (DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER)
    layer = ContentLayer.FURNITURE if is_furniture else ContentLayer.BODY

    if doc_label != DocItemLabel.SECTION_HEADER:
        doc.add_text(
            label=doc_label,
            text=inner_text,
            prov=provenance,
            parent=parent,
            content_layer=layer,
        )
        return

    # The heading level is the last "_"-separated piece of the tag name
    # (e.g. "section_level_header_1" -> 1).
    heading_level = int(tag_name.split("_")[-1])
    doc.add_heading(
        text=inner_text,
        level=heading_level,
        prov=provenance,
        parent=parent,
        content_layer=layer,
    )
|
|
3700
|
+
|
|
3592
3701
|
# doc = DoclingDocument(name="Document")
|
|
3593
3702
|
for pg_idx, doctag_page in enumerate(doctag_document.pages):
|
|
3594
3703
|
page_doctags = doctag_page.tokens
|
|
@@ -3623,7 +3732,7 @@ class DoclingDocument(BaseModel):
|
|
|
3623
3732
|
tag_pattern = (
|
|
3624
3733
|
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
|
|
3625
3734
|
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
|
|
3626
|
-
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
|
|
3735
|
+
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|{GroupLabel.INLINE}|"
|
|
3627
3736
|
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
|
|
3628
3737
|
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
|
|
3629
3738
|
rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
|
|
@@ -3648,7 +3757,7 @@ class DoclingDocument(BaseModel):
|
|
|
3648
3757
|
# no closing tag; only the existence of the item is recovered
|
|
3649
3758
|
full_chunk = f"<{tag_name}></{tag_name}>"
|
|
3650
3759
|
|
|
3651
|
-
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.
|
|
3760
|
+
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
|
|
3652
3761
|
|
|
3653
3762
|
if tag_name == DocumentToken.OTSL.value:
|
|
3654
3763
|
table_data = parse_table_content(full_chunk)
|
|
@@ -3671,6 +3780,24 @@ class DoclingDocument(BaseModel):
|
|
|
3671
3780
|
else:
|
|
3672
3781
|
doc.add_table(data=table_data, caption=caption)
|
|
3673
3782
|
|
|
3783
|
+
elif tag_name == GroupLabel.INLINE:
|
|
3784
|
+
inline_group = doc.add_inline_group()
|
|
3785
|
+
content = match.group("content")
|
|
3786
|
+
common_bbox = extract_bounding_box(content)
|
|
3787
|
+
for item_match in pattern.finditer(content):
|
|
3788
|
+
item_tag = item_match.group("tag")
|
|
3789
|
+
_add_text(
|
|
3790
|
+
full_chunk=item_match.group(0),
|
|
3791
|
+
bbox=common_bbox,
|
|
3792
|
+
pg_width=pg_width,
|
|
3793
|
+
pg_height=pg_height,
|
|
3794
|
+
page_no=page_no,
|
|
3795
|
+
tag_name=item_tag,
|
|
3796
|
+
doc_label=tag_to_doclabel.get(item_tag, DocItemLabel.TEXT),
|
|
3797
|
+
doc=doc,
|
|
3798
|
+
parent=inline_group,
|
|
3799
|
+
)
|
|
3800
|
+
|
|
3674
3801
|
elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
|
|
3675
3802
|
caption, caption_bbox = extract_caption(full_chunk)
|
|
3676
3803
|
table_data = None
|
|
@@ -3820,38 +3947,17 @@ class DoclingDocument(BaseModel):
|
|
|
3820
3947
|
)
|
|
3821
3948
|
else:
|
|
3822
3949
|
# For everything else, treat as text
|
|
3823
|
-
|
|
3824
|
-
|
|
3825
|
-
|
|
3826
|
-
|
|
3827
|
-
|
|
3828
|
-
|
|
3829
|
-
|
|
3830
|
-
|
|
3831
|
-
|
|
3950
|
+
_add_text(
|
|
3951
|
+
full_chunk=full_chunk,
|
|
3952
|
+
bbox=bbox,
|
|
3953
|
+
pg_width=pg_width,
|
|
3954
|
+
pg_height=pg_height,
|
|
3955
|
+
page_no=page_no,
|
|
3956
|
+
tag_name=tag_name,
|
|
3957
|
+
doc_label=doc_label,
|
|
3958
|
+
doc=doc,
|
|
3959
|
+
parent=None,
|
|
3832
3960
|
)
|
|
3833
|
-
|
|
3834
|
-
content_layer = ContentLayer.BODY
|
|
3835
|
-
if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
|
3836
|
-
content_layer = ContentLayer.FURNITURE
|
|
3837
|
-
|
|
3838
|
-
if doc_label == DocItemLabel.SECTION_HEADER:
|
|
3839
|
-
# Extract level from tag_name (e.g. "section_level_header_1" -> 1)
|
|
3840
|
-
level = int(tag_name.split("_")[-1])
|
|
3841
|
-
doc.add_heading(
|
|
3842
|
-
text=text_content,
|
|
3843
|
-
level=level,
|
|
3844
|
-
prov=element_prov,
|
|
3845
|
-
content_layer=content_layer,
|
|
3846
|
-
)
|
|
3847
|
-
else:
|
|
3848
|
-
doc.add_text(
|
|
3849
|
-
label=doc_label,
|
|
3850
|
-
text=text_content,
|
|
3851
|
-
prov=element_prov,
|
|
3852
|
-
content_layer=content_layer,
|
|
3853
|
-
)
|
|
3854
|
-
|
|
3855
3961
|
return doc
|
|
3856
3962
|
|
|
3857
3963
|
@deprecated("Use save_as_doctags instead.")
|
|
@@ -4149,3 +4255,59 @@ class DoclingDocument(BaseModel):
|
|
|
4149
4255
|
raise ValueError("Document hierachy is inconsistent.")
|
|
4150
4256
|
|
|
4151
4257
|
return d
|
|
4258
|
+
|
|
4259
|
+
@model_validator(mode="after")
def validate_misplaced_list_items(self):
    """Move list items that have no list parent into new list groups.

    First pass: walk every item (pictures traversed, all content layers,
    groups included) and collect list items whose parent is missing or is
    not an ``OrderedList`` / ``UnorderedList``. A misplaced item joins the
    current run when the item visited immediately before it is a list item
    whose parent is missing or resolves to ``self.body``; otherwise it
    starts a new run.

    Second pass: for each run, taken in reverse order, insert a new group
    before the run's first item (ordered when that first item is
    enumerated, unordered otherwise), delete the run's items from the
    document and re-add them as list items under the new group. Only the
    first provenance entry of each item is carried over.

    Returns the document itself.
    """
    list_group_types = (OrderedList, UnorderedList)

    # Pass 1: collect runs of successive list items lacking a list parent.
    runs: list[list[ListItem]] = []
    previous: Optional[NodeItem] = None
    for node, _ in self.iterate_items(
        traverse_pictures=True,
        included_content_layers=set(ContentLayer),
        with_groups=True,  # so that we can distinguish neighboring lists
    ):
        is_misplaced = isinstance(node, ListItem) and (
            node.parent is None
            or not isinstance(node.parent.resolve(doc=self), list_group_types)
        )
        if is_misplaced:
            continues_run = isinstance(previous, ListItem) and (
                previous.parent is None
                or previous.parent.resolve(self) == self.body
            )
            if continues_run:
                runs[-1].append(node)
            else:
                runs.append([node])
        previous = node

    # Pass 2: rebuild each run under its own group, last run first.
    for run in reversed(runs):
        head = run[0]

        # The group kind follows the first item of the run.
        group_cls = OrderedList if head.enumerated else UnorderedList
        group = group_cls(self_ref="#")
        self.insert_item_before_sibling(
            new_item=group,
            sibling=head,
        )

        # Remove the originals (should not be affected by group addition).
        self.delete_items(node_items=run)

        # Re-create every item of the run as a child of the new group.
        for entry in run:
            first_prov = entry.prov[0] if entry.prov else None
            self.add_list_item(
                text=entry.text,
                enumerated=entry.enumerated,
                marker=entry.marker,
                orig=entry.orig,
                prov=first_prov,
                parent=group,
                content_layer=entry.content_layer,
                formatting=entry.formatting,
                hyperlink=entry.hyperlink,
            )
    return self
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.34.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -26,12 +26,12 @@ docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP
|
|
|
26
26
|
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
|
|
27
27
|
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
28
28
|
docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
29
|
-
docling_core/transforms/serializer/base.py,sha256=
|
|
30
|
-
docling_core/transforms/serializer/common.py,sha256=
|
|
31
|
-
docling_core/transforms/serializer/doctags.py,sha256=
|
|
32
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
29
|
+
docling_core/transforms/serializer/base.py,sha256=ZFIiZeplL-QbBs9EDUb1awqxapQ23PsApVetJtAs7Vs,6891
|
|
30
|
+
docling_core/transforms/serializer/common.py,sha256=WP-qO-woidrKyvZ56m0vlKMysoLrMzzZtHSCIwsl3ek,19119
|
|
31
|
+
docling_core/transforms/serializer/doctags.py,sha256=PuAExlP-2HxcDSP_R_phtYQU0yKBW94RrPgb85IUxck,19905
|
|
32
|
+
docling_core/transforms/serializer/html.py,sha256=KiywrroYBS3yk07gQizlmk3oqkXg_NpFwE0VF31_Z-I,37112
|
|
33
33
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
34
|
-
docling_core/transforms/serializer/markdown.py,sha256=
|
|
34
|
+
docling_core/transforms/serializer/markdown.py,sha256=wfMNrjA4wMehWLCejAhEN1eQPRixUO1SyL6ojkKkzZY,20614
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
36
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
37
|
docling_core/transforms/visualizer/layout_visualizer.py,sha256=ulXxWGIl69-HMKDPFk_XKgNCgQeDNc969PVt_X0-drA,7823
|
|
@@ -40,7 +40,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
|
|
|
40
40
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
41
41
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
42
42
|
docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
|
|
43
|
-
docling_core/types/doc/document.py,sha256=
|
|
43
|
+
docling_core/types/doc/document.py,sha256=hQ4eXNjqbAQ2Tklr2RM7Xy0vZLwdoTymSQuJGskSOEw,148336
|
|
44
44
|
docling_core/types/doc/labels.py,sha256=vp4h3e7AmBvezRmgrfuPehjAHTZOufphErLB4ENhdME,7171
|
|
45
45
|
docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
|
|
46
46
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
@@ -73,9 +73,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
73
73
|
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
74
74
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
75
75
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
76
|
-
docling_core-2.
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
76
|
+
docling_core-2.34.1.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
+
docling_core-2.34.1.dist-info/METADATA,sha256=CnJF6fq7wPKEcfPhihxBZP0QFYf1Y5TkVKCHflpKKw0,6453
|
|
78
|
+
docling_core-2.34.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
79
|
+
docling_core-2.34.1.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
80
|
+
docling_core-2.34.1.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
81
|
+
docling_core-2.34.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|