docling-core 2.33.1__py3-none-any.whl → 2.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/serializer/base.py +34 -0
- docling_core/transforms/serializer/common.py +37 -3
- docling_core/transforms/serializer/doctags.py +65 -6
- docling_core/transforms/serializer/html.py +61 -23
- docling_core/transforms/serializer/markdown.py +85 -18
- docling_core/types/doc/document.py +211 -53
- {docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/METADATA +1 -1
- {docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/RECORD +12 -12
- {docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/WHEEL +0 -0
- {docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/top_level.txt +0 -0
|
@@ -202,6 +202,16 @@ class BaseDocSerializer(ABC):
|
|
|
202
202
|
"""Hook for strikethrough formatting serialization."""
|
|
203
203
|
...
|
|
204
204
|
|
|
205
|
+
@abstractmethod
|
|
206
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
207
|
+
"""Hook for subscript formatting serialization."""
|
|
208
|
+
...
|
|
209
|
+
|
|
210
|
+
@abstractmethod
|
|
211
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
212
|
+
"""Hook for superscript formatting serialization."""
|
|
213
|
+
...
|
|
214
|
+
|
|
205
215
|
@abstractmethod
|
|
206
216
|
def serialize_hyperlink(
|
|
207
217
|
self,
|
|
@@ -239,6 +249,15 @@ class BaseDocSerializer(ABC):
|
|
|
239
249
|
"""Serialize the item's captions."""
|
|
240
250
|
...
|
|
241
251
|
|
|
252
|
+
@abstractmethod
|
|
253
|
+
def serialize_annotations(
|
|
254
|
+
self,
|
|
255
|
+
item: DocItem,
|
|
256
|
+
**kwargs: Any,
|
|
257
|
+
) -> SerializationResult:
|
|
258
|
+
"""Serialize the item's annotations."""
|
|
259
|
+
...
|
|
260
|
+
|
|
242
261
|
@abstractmethod
|
|
243
262
|
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
|
|
244
263
|
"""Get references to excluded items."""
|
|
@@ -257,3 +276,18 @@ class BaseSerializerProvider(ABC):
|
|
|
257
276
|
def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
|
|
258
277
|
"""Get a the associated serializer."""
|
|
259
278
|
...
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class BaseAnnotationSerializer(ABC):
|
|
282
|
+
"""Base class for annotation serializers."""
|
|
283
|
+
|
|
284
|
+
@abstractmethod
|
|
285
|
+
def serialize(
|
|
286
|
+
self,
|
|
287
|
+
*,
|
|
288
|
+
item: DocItem,
|
|
289
|
+
doc: DoclingDocument,
|
|
290
|
+
**kwargs: Any,
|
|
291
|
+
) -> SerializationResult:
|
|
292
|
+
"""Serializes the passed annotation."""
|
|
293
|
+
...
|
|
@@ -15,6 +15,7 @@ from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_fie
|
|
|
15
15
|
from typing_extensions import Self, override
|
|
16
16
|
|
|
17
17
|
from docling_core.transforms.serializer.base import (
|
|
18
|
+
BaseAnnotationSerializer,
|
|
18
19
|
BaseDocSerializer,
|
|
19
20
|
BaseFallbackSerializer,
|
|
20
21
|
BaseFormSerializer,
|
|
@@ -30,6 +31,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
30
31
|
from docling_core.types.doc.document import (
|
|
31
32
|
DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
32
33
|
ContentLayer,
|
|
34
|
+
DescriptionAnnotation,
|
|
33
35
|
DocItem,
|
|
34
36
|
DoclingDocument,
|
|
35
37
|
FloatingItem,
|
|
@@ -41,9 +43,10 @@ from docling_core.types.doc.document import (
|
|
|
41
43
|
OrderedList,
|
|
42
44
|
PictureClassificationData,
|
|
43
45
|
PictureDataType,
|
|
44
|
-
PictureDescriptionData,
|
|
45
46
|
PictureItem,
|
|
46
47
|
PictureMoleculeData,
|
|
48
|
+
Script,
|
|
49
|
+
TableAnnotationType,
|
|
47
50
|
TableItem,
|
|
48
51
|
TextItem,
|
|
49
52
|
UnorderedList,
|
|
@@ -122,7 +125,9 @@ def _iterate_items(
|
|
|
122
125
|
yield item
|
|
123
126
|
|
|
124
127
|
|
|
125
|
-
def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
|
|
128
|
+
def _get_annotation_text(
|
|
129
|
+
annotation: Union[PictureDataType, TableAnnotationType],
|
|
130
|
+
) -> Optional[str]:
|
|
126
131
|
result = None
|
|
127
132
|
if isinstance(annotation, PictureClassificationData):
|
|
128
133
|
predicted_class = (
|
|
@@ -132,7 +137,7 @@ def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
|
|
|
132
137
|
)
|
|
133
138
|
if predicted_class is not None:
|
|
134
139
|
result = predicted_class.replace("_", " ")
|
|
135
|
-
elif isinstance(annotation, PictureDescriptionData):
|
|
140
|
+
elif isinstance(annotation, DescriptionAnnotation):
|
|
136
141
|
result = annotation.text
|
|
137
142
|
elif isinstance(annotation, PictureMoleculeData):
|
|
138
143
|
result = annotation.smi
|
|
@@ -211,6 +216,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
211
216
|
list_serializer: BaseListSerializer
|
|
212
217
|
inline_serializer: BaseInlineSerializer
|
|
213
218
|
|
|
219
|
+
annotation_serializer: BaseAnnotationSerializer
|
|
220
|
+
|
|
214
221
|
params: CommonParams = CommonParams()
|
|
215
222
|
|
|
216
223
|
_excluded_refs_cache: dict[str, set[str]] = {}
|
|
@@ -449,6 +456,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
449
456
|
res = self.serialize_underline(text=res)
|
|
450
457
|
if formatting.strikethrough:
|
|
451
458
|
res = self.serialize_strikethrough(text=res)
|
|
459
|
+
if formatting.script == Script.SUB:
|
|
460
|
+
res = self.serialize_subscript(text=res)
|
|
461
|
+
elif formatting.script == Script.SUPER:
|
|
462
|
+
res = self.serialize_superscript(text=res)
|
|
452
463
|
if params.include_hyperlinks and hyperlink:
|
|
453
464
|
res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
|
|
454
465
|
return res
|
|
@@ -473,6 +484,16 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
473
484
|
"""Hook for strikethrough formatting serialization."""
|
|
474
485
|
return text
|
|
475
486
|
|
|
487
|
+
@override
|
|
488
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
489
|
+
"""Hook for subscript formatting serialization."""
|
|
490
|
+
return text
|
|
491
|
+
|
|
492
|
+
@override
|
|
493
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
494
|
+
"""Hook for superscript formatting serialization."""
|
|
495
|
+
return text
|
|
496
|
+
|
|
476
497
|
@override
|
|
477
498
|
def serialize_hyperlink(
|
|
478
499
|
self,
|
|
@@ -505,6 +526,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
505
526
|
text_res = ""
|
|
506
527
|
return create_ser_result(text=text_res, span_source=results)
|
|
507
528
|
|
|
529
|
+
@override
|
|
530
|
+
def serialize_annotations(
|
|
531
|
+
self,
|
|
532
|
+
item: DocItem,
|
|
533
|
+
**kwargs: Any,
|
|
534
|
+
) -> SerializationResult:
|
|
535
|
+
"""Serialize the item's annotations."""
|
|
536
|
+
return self.annotation_serializer.serialize(
|
|
537
|
+
item=item,
|
|
538
|
+
doc=self.doc,
|
|
539
|
+
**kwargs,
|
|
540
|
+
)
|
|
541
|
+
|
|
508
542
|
def _get_applicable_pages(self) -> Optional[list[int]]:
|
|
509
543
|
pages = {
|
|
510
544
|
item.prov[0].page_no: ...
|
|
@@ -7,6 +7,7 @@ from pydantic import BaseModel
|
|
|
7
7
|
from typing_extensions import override
|
|
8
8
|
|
|
9
9
|
from docling_core.transforms.serializer.base import (
|
|
10
|
+
BaseAnnotationSerializer,
|
|
10
11
|
BaseDocSerializer,
|
|
11
12
|
BaseFallbackSerializer,
|
|
12
13
|
BaseFormSerializer,
|
|
@@ -17,12 +18,14 @@ from docling_core.transforms.serializer.base import (
|
|
|
17
18
|
BaseTableSerializer,
|
|
18
19
|
BaseTextSerializer,
|
|
19
20
|
SerializationResult,
|
|
21
|
+
Span,
|
|
20
22
|
)
|
|
21
23
|
from docling_core.transforms.serializer.common import (
|
|
22
24
|
CommonParams,
|
|
23
25
|
DocSerializer,
|
|
24
26
|
create_ser_result,
|
|
25
27
|
)
|
|
28
|
+
from docling_core.types.doc.base import BoundingBox
|
|
26
29
|
from docling_core.types.doc.document import (
|
|
27
30
|
CodeItem,
|
|
28
31
|
DocItem,
|
|
@@ -38,6 +41,7 @@ from docling_core.types.doc.document import (
|
|
|
38
41
|
PictureItem,
|
|
39
42
|
PictureMoleculeData,
|
|
40
43
|
PictureTabularChartData,
|
|
44
|
+
ProvenanceItem,
|
|
41
45
|
TableItem,
|
|
42
46
|
TextItem,
|
|
43
47
|
UnorderedList,
|
|
@@ -414,6 +418,39 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
414
418
|
class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
415
419
|
"""DocTags-specific inline group serializer."""
|
|
416
420
|
|
|
421
|
+
def _get_inline_location_tags(
|
|
422
|
+
self, doc: DoclingDocument, item: InlineGroup, params: DocTagsParams
|
|
423
|
+
) -> SerializationResult:
|
|
424
|
+
|
|
425
|
+
prov: Optional[ProvenanceItem] = None
|
|
426
|
+
boxes: list[BoundingBox] = []
|
|
427
|
+
doc_items: list[DocItem] = []
|
|
428
|
+
for it, _ in doc.iterate_items(root=item):
|
|
429
|
+
if isinstance(it, DocItem):
|
|
430
|
+
for prov in it.prov:
|
|
431
|
+
boxes.append(prov.bbox)
|
|
432
|
+
doc_items.append(it)
|
|
433
|
+
if prov is None:
|
|
434
|
+
return create_ser_result()
|
|
435
|
+
|
|
436
|
+
bbox = BoundingBox.enclosing_bbox(boxes=boxes)
|
|
437
|
+
|
|
438
|
+
# using last seen prov as reference for page dims
|
|
439
|
+
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
|
|
440
|
+
|
|
441
|
+
loc_str = DocumentToken.get_location(
|
|
442
|
+
bbox=bbox.to_top_left_origin(page_h).as_tuple(),
|
|
443
|
+
page_w=page_w,
|
|
444
|
+
page_h=page_h,
|
|
445
|
+
xsize=params.xsize,
|
|
446
|
+
ysize=params.ysize,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
return SerializationResult(
|
|
450
|
+
text=loc_str,
|
|
451
|
+
spans=[Span(item=it) for it in doc_items],
|
|
452
|
+
)
|
|
453
|
+
|
|
417
454
|
@override
|
|
418
455
|
def serialize(
|
|
419
456
|
self,
|
|
@@ -428,12 +465,23 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
428
465
|
"""Serializes the passed item."""
|
|
429
466
|
my_visited = visited if visited is not None else set()
|
|
430
467
|
params = DocTagsParams(**kwargs)
|
|
431
|
-
parts =
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
468
|
+
parts: List[SerializationResult] = []
|
|
469
|
+
if params.add_location:
|
|
470
|
+
inline_loc_tags_ser_res = self._get_inline_location_tags(
|
|
471
|
+
doc=doc,
|
|
472
|
+
item=item,
|
|
473
|
+
params=params,
|
|
474
|
+
)
|
|
475
|
+
parts.append(inline_loc_tags_ser_res)
|
|
476
|
+
params.add_location = False # suppress children location serialization
|
|
477
|
+
parts.extend(
|
|
478
|
+
doc_serializer.get_parts(
|
|
479
|
+
item=item,
|
|
480
|
+
list_level=list_level,
|
|
481
|
+
is_inline_scope=True,
|
|
482
|
+
visited=my_visited,
|
|
483
|
+
**{**kwargs, **params.model_dump()},
|
|
484
|
+
)
|
|
437
485
|
)
|
|
438
486
|
wrap_tag = DocumentToken.INLINE.value
|
|
439
487
|
delim = _get_delim(params=params)
|
|
@@ -460,6 +508,15 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
460
508
|
return create_ser_result()
|
|
461
509
|
|
|
462
510
|
|
|
511
|
+
class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
|
|
512
|
+
"""DocTags-specific annotation serializer."""
|
|
513
|
+
|
|
514
|
+
@override
|
|
515
|
+
def serialize(self, *, item: DocItem, **kwargs: Any) -> SerializationResult:
|
|
516
|
+
"""Serializes the item's annotations."""
|
|
517
|
+
return create_ser_result()
|
|
518
|
+
|
|
519
|
+
|
|
463
520
|
class DocTagsDocSerializer(DocSerializer):
|
|
464
521
|
"""DocTags-specific document serializer."""
|
|
465
522
|
|
|
@@ -473,6 +530,8 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
473
530
|
list_serializer: BaseListSerializer = DocTagsListSerializer()
|
|
474
531
|
inline_serializer: BaseInlineSerializer = DocTagsInlineSerializer()
|
|
475
532
|
|
|
533
|
+
annotation_serializer: BaseAnnotationSerializer = DocTagsAnnotationSerializer()
|
|
534
|
+
|
|
476
535
|
params: DocTagsParams = DocTagsParams()
|
|
477
536
|
|
|
478
537
|
@override
|
|
@@ -21,6 +21,7 @@ from pydantic import AnyUrl, BaseModel
|
|
|
21
21
|
from typing_extensions import override
|
|
22
22
|
|
|
23
23
|
from docling_core.transforms.serializer.base import (
|
|
24
|
+
BaseAnnotationSerializer,
|
|
24
25
|
BaseDocSerializer,
|
|
25
26
|
BaseFallbackSerializer,
|
|
26
27
|
BaseFormSerializer,
|
|
@@ -35,7 +36,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
35
36
|
from docling_core.transforms.serializer.common import (
|
|
36
37
|
CommonParams,
|
|
37
38
|
DocSerializer,
|
|
38
|
-
|
|
39
|
+
_get_annotation_text,
|
|
39
40
|
create_ser_result,
|
|
40
41
|
)
|
|
41
42
|
from docling_core.transforms.serializer.html_styles import (
|
|
@@ -47,6 +48,7 @@ from docling_core.types.doc.base import ImageRefMode
|
|
|
47
48
|
from docling_core.types.doc.document import (
|
|
48
49
|
CodeItem,
|
|
49
50
|
ContentLayer,
|
|
51
|
+
DescriptionAnnotation,
|
|
50
52
|
DocItem,
|
|
51
53
|
DoclingDocument,
|
|
52
54
|
FloatingItem,
|
|
@@ -59,7 +61,9 @@ from docling_core.types.doc.document import (
|
|
|
59
61
|
ListItem,
|
|
60
62
|
NodeItem,
|
|
61
63
|
OrderedList,
|
|
64
|
+
PictureClassificationData,
|
|
62
65
|
PictureItem,
|
|
66
|
+
PictureMoleculeData,
|
|
63
67
|
PictureTabularChartData,
|
|
64
68
|
SectionHeaderItem,
|
|
65
69
|
TableCell,
|
|
@@ -758,14 +762,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
758
762
|
"""HTML-specific fallback serializer."""
|
|
759
763
|
|
|
760
764
|
@override
|
|
761
|
-
def serialize(
|
|
762
|
-
self,
|
|
763
|
-
*,
|
|
764
|
-
item: NodeItem,
|
|
765
|
-
doc_serializer: "BaseDocSerializer",
|
|
766
|
-
doc: DoclingDocument,
|
|
767
|
-
**kwargs: Any,
|
|
768
|
-
) -> SerializationResult:
|
|
765
|
+
def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
|
|
769
766
|
"""Fallback serializer for items not handled by other serializers."""
|
|
770
767
|
if isinstance(item, DocItem):
|
|
771
768
|
return create_ser_result(
|
|
@@ -777,6 +774,42 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
777
774
|
return create_ser_result()
|
|
778
775
|
|
|
779
776
|
|
|
777
|
+
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
778
|
+
"""HTML-specific annotation serializer."""
|
|
779
|
+
|
|
780
|
+
def serialize(
|
|
781
|
+
self,
|
|
782
|
+
*,
|
|
783
|
+
item: DocItem,
|
|
784
|
+
doc: DoclingDocument,
|
|
785
|
+
**kwargs: Any,
|
|
786
|
+
) -> SerializationResult:
|
|
787
|
+
"""Serializes the passed annotation to HTML format."""
|
|
788
|
+
res_parts: list[SerializationResult] = []
|
|
789
|
+
for ann in item.get_annotations():
|
|
790
|
+
if isinstance(
|
|
791
|
+
ann,
|
|
792
|
+
(PictureClassificationData, DescriptionAnnotation, PictureMoleculeData),
|
|
793
|
+
):
|
|
794
|
+
if ann_text := _get_annotation_text(ann):
|
|
795
|
+
text_dir = get_text_direction(ann_text)
|
|
796
|
+
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
|
|
797
|
+
ann_ser_res = create_ser_result(
|
|
798
|
+
text=(
|
|
799
|
+
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
|
|
800
|
+
f"{html.escape(ann_text)}"
|
|
801
|
+
f"</div>"
|
|
802
|
+
),
|
|
803
|
+
span_source=item,
|
|
804
|
+
)
|
|
805
|
+
res_parts.append(ann_ser_res)
|
|
806
|
+
|
|
807
|
+
return create_ser_result(
|
|
808
|
+
text=" ".join([r.text for r in res_parts if r.text]),
|
|
809
|
+
span_source=res_parts,
|
|
810
|
+
)
|
|
811
|
+
|
|
812
|
+
|
|
780
813
|
class HTMLDocSerializer(DocSerializer):
|
|
781
814
|
"""HTML-specific document serializer."""
|
|
782
815
|
|
|
@@ -790,6 +823,8 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
790
823
|
list_serializer: BaseListSerializer = HTMLListSerializer()
|
|
791
824
|
inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
|
|
792
825
|
|
|
826
|
+
annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
|
|
827
|
+
|
|
793
828
|
params: HTMLParams = HTMLParams()
|
|
794
829
|
|
|
795
830
|
@override
|
|
@@ -812,6 +847,16 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
812
847
|
"""Apply HTML-specific strikethrough serialization."""
|
|
813
848
|
return f"<del>{text}</del>"
|
|
814
849
|
|
|
850
|
+
@override
|
|
851
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
852
|
+
"""Apply HTML-specific subscript serialization."""
|
|
853
|
+
return f"<sub>{text}</sub>"
|
|
854
|
+
|
|
855
|
+
@override
|
|
856
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
857
|
+
"""Apply HTML-specific superscript serialization."""
|
|
858
|
+
return f"<sup>{text}</sup>"
|
|
859
|
+
|
|
815
860
|
@override
|
|
816
861
|
def serialize_hyperlink(
|
|
817
862
|
self,
|
|
@@ -968,20 +1013,13 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
968
1013
|
results.append(cap_ser_res)
|
|
969
1014
|
|
|
970
1015
|
if params.include_annotations and item.self_ref not in excluded_refs:
|
|
971
|
-
if isinstance(item, PictureItem):
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
|
|
979
|
-
f"{html.escape(ann_text)}"
|
|
980
|
-
f"</div>"
|
|
981
|
-
),
|
|
982
|
-
span_source=item,
|
|
983
|
-
)
|
|
984
|
-
results.append(ann_ser_res)
|
|
1016
|
+
if isinstance(item, (PictureItem, TableItem)):
|
|
1017
|
+
ann_res = self.serialize_annotations(
|
|
1018
|
+
item=item,
|
|
1019
|
+
**kwargs,
|
|
1020
|
+
)
|
|
1021
|
+
if ann_res.text:
|
|
1022
|
+
results.append(ann_res)
|
|
985
1023
|
|
|
986
1024
|
text_res = params.caption_delim.join([r.text for r in results])
|
|
987
1025
|
if text_res:
|
|
@@ -15,6 +15,7 @@ from tabulate import tabulate
|
|
|
15
15
|
from typing_extensions import override
|
|
16
16
|
|
|
17
17
|
from docling_core.transforms.serializer.base import (
|
|
18
|
+
BaseAnnotationSerializer,
|
|
18
19
|
BaseDocSerializer,
|
|
19
20
|
BaseFallbackSerializer,
|
|
20
21
|
BaseFormSerializer,
|
|
@@ -29,7 +30,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
29
30
|
from docling_core.transforms.serializer.common import (
|
|
30
31
|
CommonParams,
|
|
31
32
|
DocSerializer,
|
|
32
|
-
|
|
33
|
+
_get_annotation_text,
|
|
33
34
|
_PageBreakSerResult,
|
|
34
35
|
create_ser_result,
|
|
35
36
|
)
|
|
@@ -37,6 +38,7 @@ from docling_core.types.doc.base import ImageRefMode
|
|
|
37
38
|
from docling_core.types.doc.document import (
|
|
38
39
|
CodeItem,
|
|
39
40
|
ContentLayer,
|
|
41
|
+
DescriptionAnnotation,
|
|
40
42
|
DocItem,
|
|
41
43
|
DoclingDocument,
|
|
42
44
|
FloatingItem,
|
|
@@ -48,7 +50,9 @@ from docling_core.types.doc.document import (
|
|
|
48
50
|
KeyValueItem,
|
|
49
51
|
NodeItem,
|
|
50
52
|
OrderedList,
|
|
53
|
+
PictureClassificationData,
|
|
51
54
|
PictureItem,
|
|
55
|
+
PictureMoleculeData,
|
|
52
56
|
PictureTabularChartData,
|
|
53
57
|
SectionHeaderItem,
|
|
54
58
|
TableItem,
|
|
@@ -58,6 +62,23 @@ from docling_core.types.doc.document import (
|
|
|
58
62
|
)
|
|
59
63
|
|
|
60
64
|
|
|
65
|
+
def _get_annotation_ser_result(
|
|
66
|
+
ann_kind: str, ann_text: str, mark_annotation: bool, doc_item: DocItem
|
|
67
|
+
):
|
|
68
|
+
return create_ser_result(
|
|
69
|
+
text=(
|
|
70
|
+
(
|
|
71
|
+
f'<!--<annotation kind="{ann_kind}">-->'
|
|
72
|
+
f"{ann_text}"
|
|
73
|
+
f"<!--<annotation/>-->"
|
|
74
|
+
)
|
|
75
|
+
if mark_annotation
|
|
76
|
+
else ann_text
|
|
77
|
+
),
|
|
78
|
+
span_source=doc_item,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
|
|
61
82
|
class MarkdownParams(CommonParams):
|
|
62
83
|
"""Markdown-specific serialization parameters."""
|
|
63
84
|
|
|
@@ -136,6 +157,49 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
136
157
|
return create_ser_result(text=text, span_source=res_parts)
|
|
137
158
|
|
|
138
159
|
|
|
160
|
+
class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
161
|
+
"""Markdown-specific annotation serializer."""
|
|
162
|
+
|
|
163
|
+
def serialize(
|
|
164
|
+
self,
|
|
165
|
+
*,
|
|
166
|
+
item: DocItem,
|
|
167
|
+
doc: DoclingDocument,
|
|
168
|
+
**kwargs: Any,
|
|
169
|
+
) -> SerializationResult:
|
|
170
|
+
"""Serialize the item's annotations."""
|
|
171
|
+
params = MarkdownParams(**kwargs)
|
|
172
|
+
|
|
173
|
+
res_parts: list[SerializationResult] = []
|
|
174
|
+
for ann in item.get_annotations():
|
|
175
|
+
if isinstance(
|
|
176
|
+
ann,
|
|
177
|
+
(
|
|
178
|
+
PictureClassificationData,
|
|
179
|
+
DescriptionAnnotation,
|
|
180
|
+
PictureMoleculeData,
|
|
181
|
+
),
|
|
182
|
+
):
|
|
183
|
+
if ann_text := _get_annotation_text(ann):
|
|
184
|
+
ann_res = create_ser_result(
|
|
185
|
+
text=(
|
|
186
|
+
(
|
|
187
|
+
f'<!--<annotation kind="{ann.kind}">-->'
|
|
188
|
+
f"{ann_text}"
|
|
189
|
+
f"<!--<annotation/>-->"
|
|
190
|
+
)
|
|
191
|
+
if params.mark_annotations
|
|
192
|
+
else ann_text
|
|
193
|
+
),
|
|
194
|
+
span_source=item,
|
|
195
|
+
)
|
|
196
|
+
res_parts.append(ann_res)
|
|
197
|
+
return create_ser_result(
|
|
198
|
+
text="\n\n".join([r.text for r in res_parts if r.text]),
|
|
199
|
+
span_source=item,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
|
|
139
203
|
class MarkdownTableSerializer(BaseTableSerializer):
|
|
140
204
|
"""Markdown-specific table item serializer."""
|
|
141
205
|
|
|
@@ -149,6 +213,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
149
213
|
**kwargs: Any,
|
|
150
214
|
) -> SerializationResult:
|
|
151
215
|
"""Serializes the passed item."""
|
|
216
|
+
params = MarkdownParams(**kwargs)
|
|
152
217
|
res_parts: list[SerializationResult] = []
|
|
153
218
|
|
|
154
219
|
cap_res = doc_serializer.serialize_captions(
|
|
@@ -159,6 +224,16 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
159
224
|
res_parts.append(cap_res)
|
|
160
225
|
|
|
161
226
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
227
|
+
|
|
228
|
+
if params.include_annotations:
|
|
229
|
+
|
|
230
|
+
ann_res = doc_serializer.serialize_annotations(
|
|
231
|
+
item=item,
|
|
232
|
+
**kwargs,
|
|
233
|
+
)
|
|
234
|
+
if ann_res.text:
|
|
235
|
+
res_parts.append(ann_res)
|
|
236
|
+
|
|
162
237
|
rows = [
|
|
163
238
|
[
|
|
164
239
|
# make sure that md tables are not broken
|
|
@@ -214,22 +289,12 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
214
289
|
|
|
215
290
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
216
291
|
if params.include_annotations:
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
f'<!--<annotation kind="{ann.kind}">-->'
|
|
224
|
-
f"{ann_text}"
|
|
225
|
-
f"<!--<annotation/>-->"
|
|
226
|
-
)
|
|
227
|
-
if params.mark_annotations
|
|
228
|
-
else ann_text
|
|
229
|
-
),
|
|
230
|
-
span_source=item,
|
|
231
|
-
)
|
|
232
|
-
res_parts.append(ann_ser_res)
|
|
292
|
+
ann_res = doc_serializer.serialize_annotations(
|
|
293
|
+
item=item,
|
|
294
|
+
**kwargs,
|
|
295
|
+
)
|
|
296
|
+
if ann_res.text:
|
|
297
|
+
res_parts.append(ann_res)
|
|
233
298
|
|
|
234
299
|
img_res = self._serialize_image_part(
|
|
235
300
|
item=item,
|
|
@@ -257,7 +322,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
257
322
|
res_parts.append(
|
|
258
323
|
create_ser_result(text=md_table_content, span_source=item)
|
|
259
324
|
)
|
|
260
|
-
text_res = "\n\n".join([r.text for r in res_parts])
|
|
325
|
+
text_res = "\n\n".join([r.text for r in res_parts if r.text])
|
|
261
326
|
|
|
262
327
|
return create_ser_result(text=text_res, span_source=res_parts)
|
|
263
328
|
|
|
@@ -471,6 +536,8 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
471
536
|
list_serializer: BaseListSerializer = MarkdownListSerializer()
|
|
472
537
|
inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
|
|
473
538
|
|
|
539
|
+
annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
|
|
540
|
+
|
|
474
541
|
params: MarkdownParams = MarkdownParams()
|
|
475
542
|
|
|
476
543
|
@override
|
|
@@ -15,7 +15,7 @@ import warnings
|
|
|
15
15
|
from enum import Enum
|
|
16
16
|
from io import BytesIO
|
|
17
17
|
from pathlib import Path
|
|
18
|
-
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
18
|
+
from typing import Any, Dict, Final, List, Literal, Optional, Sequence, Tuple, Union
|
|
19
19
|
from urllib.parse import unquote
|
|
20
20
|
|
|
21
21
|
import pandas as pd
|
|
@@ -30,6 +30,7 @@ from pydantic import (
|
|
|
30
30
|
computed_field,
|
|
31
31
|
field_validator,
|
|
32
32
|
model_validator,
|
|
33
|
+
validate_call,
|
|
33
34
|
)
|
|
34
35
|
from tabulate import tabulate
|
|
35
36
|
from typing_extensions import Annotated, Self, deprecated
|
|
@@ -53,7 +54,7 @@ _logger = logging.getLogger(__name__)
|
|
|
53
54
|
|
|
54
55
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
55
56
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
56
|
-
CURRENT_VERSION: Final = "1.
|
|
57
|
+
CURRENT_VERSION: Final = "1.4.0"
|
|
57
58
|
|
|
58
59
|
DEFAULT_EXPORT_LABELS = {
|
|
59
60
|
DocItemLabel.TITLE,
|
|
@@ -85,8 +86,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
|
|
|
85
86
|
)
|
|
86
87
|
|
|
87
88
|
|
|
88
|
-
class BasePictureData(BaseModel):
|
|
89
|
-
"""
|
|
89
|
+
class BaseAnnotation(BaseModel):
|
|
90
|
+
"""Base class for all annotation types."""
|
|
90
91
|
|
|
91
92
|
kind: str
|
|
92
93
|
|
|
@@ -98,7 +99,7 @@ class PictureClassificationClass(BaseModel):
|
|
|
98
99
|
confidence: float
|
|
99
100
|
|
|
100
101
|
|
|
101
|
-
class PictureClassificationData(BasePictureData):
|
|
102
|
+
class PictureClassificationData(BaseAnnotation):
|
|
102
103
|
"""PictureClassificationData."""
|
|
103
104
|
|
|
104
105
|
kind: Literal["classification"] = "classification"
|
|
@@ -106,19 +107,18 @@ class PictureClassificationData(BasePictureData):
|
|
|
106
107
|
predicted_classes: List[PictureClassificationClass]
|
|
107
108
|
|
|
108
109
|
|
|
109
|
-
class PictureDescriptionData(BasePictureData):
|
|
110
|
-
"""
|
|
110
|
+
class DescriptionAnnotation(BaseAnnotation):
|
|
111
|
+
"""DescriptionAnnotation."""
|
|
111
112
|
|
|
112
113
|
kind: Literal["description"] = "description"
|
|
113
114
|
text: str
|
|
114
115
|
provenance: str
|
|
115
116
|
|
|
116
117
|
|
|
117
|
-
class PictureMoleculeData(BaseModel):
|
|
118
|
+
class PictureMoleculeData(BaseAnnotation):
|
|
118
119
|
"""PictureMoleculeData."""
|
|
119
120
|
|
|
120
121
|
kind: Literal["molecule_data"] = "molecule_data"
|
|
121
|
-
|
|
122
122
|
smi: str
|
|
123
123
|
confidence: float
|
|
124
124
|
class_name: str
|
|
@@ -126,13 +126,19 @@ class PictureMoleculeData(BaseModel):
|
|
|
126
126
|
provenance: str
|
|
127
127
|
|
|
128
128
|
|
|
129
|
-
class
|
|
130
|
-
"""
|
|
129
|
+
class MiscAnnotation(BaseAnnotation):
|
|
130
|
+
"""MiscAnnotation."""
|
|
131
131
|
|
|
132
132
|
kind: Literal["misc"] = "misc"
|
|
133
133
|
content: Dict[str, Any]
|
|
134
134
|
|
|
135
135
|
|
|
136
|
+
# deprecated aliases:
|
|
137
|
+
BasePictureData = BaseAnnotation
|
|
138
|
+
PictureDescriptionData = DescriptionAnnotation
|
|
139
|
+
PictureMiscData = MiscAnnotation
|
|
140
|
+
|
|
141
|
+
|
|
136
142
|
class ChartLine(BaseModel):
|
|
137
143
|
"""Represents a line in a line chart.
|
|
138
144
|
|
|
@@ -196,7 +202,7 @@ class ChartPoint(BaseModel):
|
|
|
196
202
|
value: Tuple[float, float]
|
|
197
203
|
|
|
198
204
|
|
|
199
|
-
class PictureChartData(
|
|
205
|
+
class PictureChartData(BaseAnnotation):
|
|
200
206
|
"""Base class for picture chart data.
|
|
201
207
|
|
|
202
208
|
Attributes:
|
|
@@ -381,10 +387,10 @@ class PictureTabularChartData(PictureChartData):
|
|
|
381
387
|
|
|
382
388
|
PictureDataType = Annotated[
|
|
383
389
|
Union[
|
|
390
|
+
DescriptionAnnotation,
|
|
391
|
+
MiscAnnotation,
|
|
384
392
|
PictureClassificationData,
|
|
385
|
-
PictureDescriptionData,
|
|
386
393
|
PictureMoleculeData,
|
|
387
|
-
PictureMiscData,
|
|
388
394
|
PictureTabularChartData,
|
|
389
395
|
PictureLineChartData,
|
|
390
396
|
PictureBarChartData,
|
|
@@ -818,6 +824,18 @@ class DocItem(
|
|
|
818
824
|
)
|
|
819
825
|
return page_image.crop(crop_bbox.as_tuple())
|
|
820
826
|
|
|
827
|
+
def get_annotations(self) -> Sequence[BaseAnnotation]:
|
|
828
|
+
"""Get the annotations of this DocItem."""
|
|
829
|
+
return []
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
class Script(str, Enum):
|
|
833
|
+
"""Text script position."""
|
|
834
|
+
|
|
835
|
+
BASELINE = "baseline"
|
|
836
|
+
SUB = "sub"
|
|
837
|
+
SUPER = "super"
|
|
838
|
+
|
|
821
839
|
|
|
822
840
|
class Formatting(BaseModel):
|
|
823
841
|
"""Formatting."""
|
|
@@ -826,6 +844,7 @@ class Formatting(BaseModel):
|
|
|
826
844
|
italic: bool = False
|
|
827
845
|
underline: bool = False
|
|
828
846
|
strikethrough: bool = False
|
|
847
|
+
script: Script = Script.BASELINE
|
|
829
848
|
|
|
830
849
|
|
|
831
850
|
class TextItem(DocItem):
|
|
@@ -1182,6 +1201,19 @@ class PictureItem(FloatingItem):
|
|
|
1182
1201
|
text = serializer.serialize(item=self).text
|
|
1183
1202
|
return text
|
|
1184
1203
|
|
|
1204
|
+
def get_annotations(self) -> Sequence[BaseAnnotation]:
|
|
1205
|
+
"""Get the annotations of this PictureItem."""
|
|
1206
|
+
return self.annotations
|
|
1207
|
+
|
|
1208
|
+
|
|
1209
|
+
TableAnnotationType = Annotated[
|
|
1210
|
+
Union[
|
|
1211
|
+
DescriptionAnnotation,
|
|
1212
|
+
MiscAnnotation,
|
|
1213
|
+
],
|
|
1214
|
+
Field(discriminator="kind"),
|
|
1215
|
+
]
|
|
1216
|
+
|
|
1185
1217
|
|
|
1186
1218
|
class TableItem(FloatingItem):
|
|
1187
1219
|
"""TableItem."""
|
|
@@ -1192,6 +1224,8 @@ class TableItem(FloatingItem):
|
|
|
1192
1224
|
DocItemLabel.TABLE,
|
|
1193
1225
|
] = DocItemLabel.TABLE
|
|
1194
1226
|
|
|
1227
|
+
annotations: List[TableAnnotationType] = []
|
|
1228
|
+
|
|
1195
1229
|
def export_to_dataframe(self) -> pd.DataFrame:
|
|
1196
1230
|
"""Export the table as a Pandas DataFrame."""
|
|
1197
1231
|
if self.data.num_rows == 0 or self.data.num_cols == 0:
|
|
@@ -1438,6 +1472,15 @@ class TableItem(FloatingItem):
|
|
|
1438
1472
|
text = serializer.serialize(item=self).text
|
|
1439
1473
|
return text
|
|
1440
1474
|
|
|
1475
|
+
@validate_call
|
|
1476
|
+
def add_annotation(self, annotation: TableAnnotationType) -> None:
|
|
1477
|
+
"""Add an annotation to the table."""
|
|
1478
|
+
self.annotations.append(annotation)
|
|
1479
|
+
|
|
1480
|
+
def get_annotations(self) -> Sequence[BaseAnnotation]:
|
|
1481
|
+
"""Get the annotations of this TableItem."""
|
|
1482
|
+
return self.annotations
|
|
1483
|
+
|
|
1441
1484
|
|
|
1442
1485
|
class GraphCell(BaseModel):
|
|
1443
1486
|
"""GraphCell."""
|
|
@@ -1776,6 +1819,18 @@ class DoclingDocument(BaseModel):
|
|
|
1776
1819
|
item.parent = parent_ref
|
|
1777
1820
|
|
|
1778
1821
|
self.form_items.append(item)
|
|
1822
|
+
|
|
1823
|
+
elif isinstance(item, (UnorderedList, OrderedList, InlineGroup)):
|
|
1824
|
+
item_label = "groups"
|
|
1825
|
+
item_index = len(self.groups)
|
|
1826
|
+
|
|
1827
|
+
cref = f"#/{item_label}/{item_index}"
|
|
1828
|
+
|
|
1829
|
+
item.self_ref = cref
|
|
1830
|
+
item.parent = parent_ref
|
|
1831
|
+
|
|
1832
|
+
self.groups.append(item)
|
|
1833
|
+
|
|
1779
1834
|
else:
|
|
1780
1835
|
raise ValueError(f"Item {item} is not supported for insertion")
|
|
1781
1836
|
|
|
@@ -2111,8 +2166,8 @@ class DoclingDocument(BaseModel):
|
|
|
2111
2166
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
2112
2167
|
|
|
2113
2168
|
"""
|
|
2114
|
-
if not parent:
|
|
2115
|
-
parent
|
|
2169
|
+
if not isinstance(parent, (OrderedList, UnorderedList)):
|
|
2170
|
+
raise ValueError("ListItem's parent must be a list group")
|
|
2116
2171
|
|
|
2117
2172
|
if not orig:
|
|
2118
2173
|
orig = text
|
|
@@ -2267,6 +2322,7 @@ class DoclingDocument(BaseModel):
|
|
|
2267
2322
|
parent: Optional[NodeItem] = None,
|
|
2268
2323
|
label: DocItemLabel = DocItemLabel.TABLE,
|
|
2269
2324
|
content_layer: Optional[ContentLayer] = None,
|
|
2325
|
+
annotations: Optional[list[TableAnnotationType]] = None,
|
|
2270
2326
|
):
|
|
2271
2327
|
"""add_table.
|
|
2272
2328
|
|
|
@@ -2284,7 +2340,11 @@ class DoclingDocument(BaseModel):
|
|
|
2284
2340
|
cref = f"#/tables/{table_index}"
|
|
2285
2341
|
|
|
2286
2342
|
tbl_item = TableItem(
|
|
2287
|
-
label=label,
|
|
2343
|
+
label=label,
|
|
2344
|
+
data=data,
|
|
2345
|
+
self_ref=cref,
|
|
2346
|
+
parent=parent.get_ref(),
|
|
2347
|
+
annotations=annotations or [],
|
|
2288
2348
|
)
|
|
2289
2349
|
if prov:
|
|
2290
2350
|
tbl_item.prov.append(prov)
|
|
@@ -2301,7 +2361,7 @@ class DoclingDocument(BaseModel):
|
|
|
2301
2361
|
|
|
2302
2362
|
def add_picture(
|
|
2303
2363
|
self,
|
|
2304
|
-
annotations: List[PictureDataType] =
|
|
2364
|
+
annotations: Optional[List[PictureDataType]] = None,
|
|
2305
2365
|
image: Optional[ImageRef] = None,
|
|
2306
2366
|
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
2307
2367
|
prov: Optional[ProvenanceItem] = None,
|
|
@@ -2310,7 +2370,7 @@ class DoclingDocument(BaseModel):
|
|
|
2310
2370
|
):
|
|
2311
2371
|
"""add_picture.
|
|
2312
2372
|
|
|
2313
|
-
:param data: List[PictureData]: (Default value =
|
|
2373
|
+
:param data: Optional[List[PictureData]]: (Default value = None)
|
|
2314
2374
|
:param caption: Optional[Union[TextItem:
|
|
2315
2375
|
:param RefItem]]: (Default value = None)
|
|
2316
2376
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
@@ -2324,7 +2384,7 @@ class DoclingDocument(BaseModel):
|
|
|
2324
2384
|
|
|
2325
2385
|
fig_item = PictureItem(
|
|
2326
2386
|
label=DocItemLabel.PICTURE,
|
|
2327
|
-
annotations=annotations,
|
|
2387
|
+
annotations=annotations or [],
|
|
2328
2388
|
image=image,
|
|
2329
2389
|
self_ref=cref,
|
|
2330
2390
|
parent=parent.get_ref(),
|
|
@@ -3589,6 +3649,52 @@ class DoclingDocument(BaseModel):
|
|
|
3589
3649
|
|
|
3590
3650
|
return (GraphData(cells=cells, links=links), overall_prov)
|
|
3591
3651
|
|
|
3652
|
+
def _add_text(
    full_chunk: str,
    bbox: Optional[BoundingBox],
    pg_width: int,
    pg_height: int,
    page_no: int,
    tag_name: str,
    doc_label: DocItemLabel,
    doc: DoclingDocument,
    parent: Optional[NodeItem],
):
    """Add one doctags chunk to ``doc`` as a heading or a plain text item.

    :param full_chunk: str: the chunk whose inner text is extracted
    :param bbox: Optional[BoundingBox]: if truthy, a provenance entry is
        built from it, rescaled with ``pg_width`` / ``pg_height``
    :param pg_width: int: page width passed to ``resize_by_scale``
    :param pg_height: int: page height passed to ``resize_by_scale``
    :param page_no: int: page number stored in the provenance entry
    :param tag_name: str: tag of the chunk; selects the content layer and,
        for section headers, carries the heading level as its last
        ``_``-separated part
    :param doc_label: DocItemLabel: label used to pick heading vs. text
    :param doc: DoclingDocument: document that receives the new item
    :param parent: Optional[NodeItem]: parent node for the new item
    """
    inner_text = extract_inner_text(full_chunk)

    # Provenance is only recorded when a bounding box is available.
    provenance = None
    if bbox:
        provenance = ProvenanceItem(
            bbox=bbox.resize_by_scale(pg_width, pg_height),
            charspan=(0, len(inner_text)),
            page_no=page_no,
        )

    # Page headers/footers go to the furniture layer, everything else to body.
    is_furniture = tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
    layer = ContentLayer.FURNITURE if is_furniture else ContentLayer.BODY

    if doc_label == DocItemLabel.SECTION_HEADER:
        # Extract level from tag_name (e.g. "section_level_header_1" -> 1)
        heading_level = int(tag_name.split("_")[-1])
        doc.add_heading(
            text=inner_text,
            level=heading_level,
            prov=provenance,
            parent=parent,
            content_layer=layer,
        )
        return

    doc.add_text(
        label=doc_label,
        text=inner_text,
        prov=provenance,
        parent=parent,
        content_layer=layer,
    )
|
|
3697
|
+
|
|
3592
3698
|
# doc = DoclingDocument(name="Document")
|
|
3593
3699
|
for pg_idx, doctag_page in enumerate(doctag_document.pages):
|
|
3594
3700
|
page_doctags = doctag_page.tokens
|
|
@@ -3623,7 +3729,7 @@ class DoclingDocument(BaseModel):
|
|
|
3623
3729
|
tag_pattern = (
|
|
3624
3730
|
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
|
|
3625
3731
|
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
|
|
3626
|
-
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
|
|
3732
|
+
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|{GroupLabel.INLINE}|"
|
|
3627
3733
|
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
|
|
3628
3734
|
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
|
|
3629
3735
|
rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
|
|
@@ -3648,7 +3754,7 @@ class DoclingDocument(BaseModel):
|
|
|
3648
3754
|
# no closing tag; only the existence of the item is recovered
|
|
3649
3755
|
full_chunk = f"<{tag_name}></{tag_name}>"
|
|
3650
3756
|
|
|
3651
|
-
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.
|
|
3757
|
+
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
|
|
3652
3758
|
|
|
3653
3759
|
if tag_name == DocumentToken.OTSL.value:
|
|
3654
3760
|
table_data = parse_table_content(full_chunk)
|
|
@@ -3671,6 +3777,24 @@ class DoclingDocument(BaseModel):
|
|
|
3671
3777
|
else:
|
|
3672
3778
|
doc.add_table(data=table_data, caption=caption)
|
|
3673
3779
|
|
|
3780
|
+
elif tag_name == GroupLabel.INLINE:
|
|
3781
|
+
inline_group = doc.add_inline_group()
|
|
3782
|
+
content = match.group("content")
|
|
3783
|
+
common_bbox = extract_bounding_box(content)
|
|
3784
|
+
for item_match in pattern.finditer(content):
|
|
3785
|
+
item_tag = item_match.group("tag")
|
|
3786
|
+
_add_text(
|
|
3787
|
+
full_chunk=item_match.group(0),
|
|
3788
|
+
bbox=common_bbox,
|
|
3789
|
+
pg_width=pg_width,
|
|
3790
|
+
pg_height=pg_height,
|
|
3791
|
+
page_no=page_no,
|
|
3792
|
+
tag_name=item_tag,
|
|
3793
|
+
doc_label=tag_to_doclabel.get(item_tag, DocItemLabel.TEXT),
|
|
3794
|
+
doc=doc,
|
|
3795
|
+
parent=inline_group,
|
|
3796
|
+
)
|
|
3797
|
+
|
|
3674
3798
|
elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
|
|
3675
3799
|
caption, caption_bbox = extract_caption(full_chunk)
|
|
3676
3800
|
table_data = None
|
|
@@ -3820,38 +3944,17 @@ class DoclingDocument(BaseModel):
|
|
|
3820
3944
|
)
|
|
3821
3945
|
else:
|
|
3822
3946
|
# For everything else, treat as text
|
|
3823
|
-
|
|
3824
|
-
|
|
3825
|
-
|
|
3826
|
-
|
|
3827
|
-
|
|
3828
|
-
|
|
3829
|
-
|
|
3830
|
-
|
|
3831
|
-
|
|
3947
|
+
_add_text(
|
|
3948
|
+
full_chunk=full_chunk,
|
|
3949
|
+
bbox=bbox,
|
|
3950
|
+
pg_width=pg_width,
|
|
3951
|
+
pg_height=pg_height,
|
|
3952
|
+
page_no=page_no,
|
|
3953
|
+
tag_name=tag_name,
|
|
3954
|
+
doc_label=doc_label,
|
|
3955
|
+
doc=doc,
|
|
3956
|
+
parent=None,
|
|
3832
3957
|
)
|
|
3833
|
-
|
|
3834
|
-
content_layer = ContentLayer.BODY
|
|
3835
|
-
if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
|
3836
|
-
content_layer = ContentLayer.FURNITURE
|
|
3837
|
-
|
|
3838
|
-
if doc_label == DocItemLabel.SECTION_HEADER:
|
|
3839
|
-
# Extract level from tag_name (e.g. "section_level_header_1" -> 1)
|
|
3840
|
-
level = int(tag_name.split("_")[-1])
|
|
3841
|
-
doc.add_heading(
|
|
3842
|
-
text=text_content,
|
|
3843
|
-
level=level,
|
|
3844
|
-
prov=element_prov,
|
|
3845
|
-
content_layer=content_layer,
|
|
3846
|
-
)
|
|
3847
|
-
else:
|
|
3848
|
-
doc.add_text(
|
|
3849
|
-
label=doc_label,
|
|
3850
|
-
text=text_content,
|
|
3851
|
-
prov=element_prov,
|
|
3852
|
-
content_layer=content_layer,
|
|
3853
|
-
)
|
|
3854
|
-
|
|
3855
3958
|
return doc
|
|
3856
3959
|
|
|
3857
3960
|
@deprecated("Use save_as_doctags instead.")
|
|
@@ -4149,3 +4252,58 @@ class DoclingDocument(BaseModel):
|
|
|
4149
4252
|
raise ValueError("Document hierachy is inconsistent.")
|
|
4150
4253
|
|
|
4151
4254
|
return d
|
|
4255
|
+
|
|
4256
|
+
@model_validator(mode="after")
def validate_misplaced_list_items(self):
    """Move list items that lack a list-group parent into new list groups.

    Runs as an "after" model validator. Consecutive misplaced list items
    (as seen by ``iterate_items``) are collected into one run. For each run
    a new ``OrderedList`` or ``UnorderedList`` is inserted before the run's
    first item; the choice follows that first item's ``enumerated`` flag.
    The run's items are then deleted and re-added as children of the new
    group.

    A run is only continued when the immediately preceding iterated item
    was itself a misplaced list item. A misplaced item that follows a
    correctly placed list item therefore starts its own run instead of
    being appended to an unrelated (or non-existent) earlier run.

    :returns: the document itself, as required for an "after" validator
    """
    list_group_types = (OrderedList, UnorderedList)

    # find list items without list parent, putting successive ones together
    misplaced_list_items: list[list[ListItem]] = []
    prev_was_misplaced = False
    for item, _ in self.iterate_items(
        traverse_pictures=True,
        included_content_layers={c for c in ContentLayer},
        with_groups=True,  # so that we can distinguish neighboring lists
    ):
        is_misplaced = isinstance(item, ListItem) and (
            item.parent is None
            or not isinstance(item.parent.resolve(doc=self), list_group_types)
        )
        if is_misplaced:
            if prev_was_misplaced:
                misplaced_list_items[-1].append(item)
            else:  # if new list
                misplaced_list_items.append([item])
        prev_was_misplaced = is_misplaced

    # NOTE(review): runs are processed last-to-first, presumably so that
    # edits do not disturb the runs still to be handled -- confirm.
    for curr_list_items in reversed(misplaced_list_items):

        # add group
        first_item = curr_list_items[0]
        if first_item.enumerated:
            new_group = OrderedList(self_ref="#")
        else:
            new_group = UnorderedList(self_ref="#")
        self.insert_item_before_sibling(
            new_item=new_group,
            sibling=first_item,
        )

        # delete list items from document (should not be affected by group addition)
        self.delete_items(node_items=curr_list_items)

        # add list items to new group
        for li in curr_list_items:
            self.add_list_item(
                text=li.text,
                enumerated=li.enumerated,
                marker=li.marker,
                orig=li.orig,
                # only the first provenance entry (if any) is carried over
                prov=li.prov[0] if li.prov else None,
                parent=new_group,
                content_layer=li.content_layer,
                formatting=li.formatting,
                hyperlink=li.hyperlink,
            )
    return self
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.34.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -26,12 +26,12 @@ docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP
|
|
|
26
26
|
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
|
|
27
27
|
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
28
28
|
docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
29
|
-
docling_core/transforms/serializer/base.py,sha256=
|
|
30
|
-
docling_core/transforms/serializer/common.py,sha256=
|
|
31
|
-
docling_core/transforms/serializer/doctags.py,sha256=
|
|
32
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
29
|
+
docling_core/transforms/serializer/base.py,sha256=ZFIiZeplL-QbBs9EDUb1awqxapQ23PsApVetJtAs7Vs,6891
|
|
30
|
+
docling_core/transforms/serializer/common.py,sha256=WP-qO-woidrKyvZ56m0vlKMysoLrMzzZtHSCIwsl3ek,19119
|
|
31
|
+
docling_core/transforms/serializer/doctags.py,sha256=PuAExlP-2HxcDSP_R_phtYQU0yKBW94RrPgb85IUxck,19905
|
|
32
|
+
docling_core/transforms/serializer/html.py,sha256=KiywrroYBS3yk07gQizlmk3oqkXg_NpFwE0VF31_Z-I,37112
|
|
33
33
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
34
|
-
docling_core/transforms/serializer/markdown.py,sha256=
|
|
34
|
+
docling_core/transforms/serializer/markdown.py,sha256=wfMNrjA4wMehWLCejAhEN1eQPRixUO1SyL6ojkKkzZY,20614
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
36
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
37
|
docling_core/transforms/visualizer/layout_visualizer.py,sha256=ulXxWGIl69-HMKDPFk_XKgNCgQeDNc969PVt_X0-drA,7823
|
|
@@ -40,7 +40,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
|
|
|
40
40
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
41
41
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
42
42
|
docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
|
|
43
|
-
docling_core/types/doc/document.py,sha256=
|
|
43
|
+
docling_core/types/doc/document.py,sha256=VKZg1VT-H8gTXybgY6lRlcKKR3f6mFDB9UzcrLtII5I,148197
|
|
44
44
|
docling_core/types/doc/labels.py,sha256=vp4h3e7AmBvezRmgrfuPehjAHTZOufphErLB4ENhdME,7171
|
|
45
45
|
docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
|
|
46
46
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
@@ -73,9 +73,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
73
73
|
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
74
74
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
75
75
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
76
|
-
docling_core-2.
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
76
|
+
docling_core-2.34.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
+
docling_core-2.34.0.dist-info/METADATA,sha256=853af3C8OZrbXzZqYFhfDfu-gtG4m7my-6wqzCir_cg,6453
|
|
78
|
+
docling_core-2.34.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
79
|
+
docling_core-2.34.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
80
|
+
docling_core-2.34.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
81
|
+
docling_core-2.34.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|