docling-core 2.49.0__py3-none-any.whl → 2.50.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/cli/view.py +21 -5
- docling_core/transforms/serializer/base.py +31 -0
- docling_core/transforms/serializer/common.py +180 -100
- docling_core/transforms/serializer/doctags.py +35 -20
- docling_core/transforms/serializer/html.py +78 -3
- docling_core/transforms/serializer/markdown.py +109 -5
- docling_core/types/doc/__init__.py +11 -0
- docling_core/types/doc/document.py +358 -7
- docling_core/types/doc/tokens.py +6 -0
- {docling_core-2.49.0.dist-info → docling_core-2.50.0.dist-info}/METADATA +1 -1
- {docling_core-2.49.0.dist-info → docling_core-2.50.0.dist-info}/RECORD +15 -15
- {docling_core-2.49.0.dist-info → docling_core-2.50.0.dist-info}/WHEEL +0 -0
- {docling_core-2.49.0.dist-info → docling_core-2.50.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.49.0.dist-info → docling_core-2.50.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.49.0.dist-info → docling_core-2.50.0.dist-info}/top_level.txt +0 -0
|
@@ -17,7 +17,7 @@ from xml.sax.saxutils import unescape
|
|
|
17
17
|
|
|
18
18
|
import latex2mathml.converter
|
|
19
19
|
from PIL.Image import Image
|
|
20
|
-
from pydantic import AnyUrl, BaseModel
|
|
20
|
+
from pydantic import AnyUrl, BaseModel, Field
|
|
21
21
|
from typing_extensions import override
|
|
22
22
|
|
|
23
23
|
from docling_core.transforms.serializer.base import (
|
|
@@ -28,6 +28,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
28
28
|
BaseInlineSerializer,
|
|
29
29
|
BaseKeyValueSerializer,
|
|
30
30
|
BaseListSerializer,
|
|
31
|
+
BaseMetaSerializer,
|
|
31
32
|
BasePictureSerializer,
|
|
32
33
|
BaseTableSerializer,
|
|
33
34
|
BaseTextSerializer,
|
|
@@ -46,9 +47,11 @@ from docling_core.transforms.serializer.html_styles import (
|
|
|
46
47
|
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
47
48
|
from docling_core.types.doc.base import ImageRefMode
|
|
48
49
|
from docling_core.types.doc.document import (
|
|
50
|
+
BaseMeta,
|
|
49
51
|
CodeItem,
|
|
50
52
|
ContentLayer,
|
|
51
53
|
DescriptionAnnotation,
|
|
54
|
+
DescriptionMetaField,
|
|
52
55
|
DocItem,
|
|
53
56
|
DoclingDocument,
|
|
54
57
|
FloatingItem,
|
|
@@ -61,14 +64,18 @@ from docling_core.types.doc.document import (
|
|
|
61
64
|
KeyValueItem,
|
|
62
65
|
ListGroup,
|
|
63
66
|
ListItem,
|
|
67
|
+
MoleculeMetaField,
|
|
64
68
|
NodeItem,
|
|
65
69
|
PictureClassificationData,
|
|
70
|
+
PictureClassificationMetaField,
|
|
66
71
|
PictureItem,
|
|
67
72
|
PictureMoleculeData,
|
|
68
73
|
PictureTabularChartData,
|
|
69
74
|
RichTableCell,
|
|
70
75
|
SectionHeaderItem,
|
|
76
|
+
SummaryMetaField,
|
|
71
77
|
TableItem,
|
|
78
|
+
TabularChartMetaField,
|
|
72
79
|
TextItem,
|
|
73
80
|
TitleItem,
|
|
74
81
|
)
|
|
@@ -115,7 +122,11 @@ class HTMLParams(CommonParams):
|
|
|
115
122
|
# Enable charts to be printed into HTML as tables
|
|
116
123
|
enable_chart_tables: bool = True
|
|
117
124
|
|
|
118
|
-
include_annotations: bool =
|
|
125
|
+
include_annotations: bool = Field(
|
|
126
|
+
default=True,
|
|
127
|
+
description="Include item annotations.",
|
|
128
|
+
deprecated="Use include_meta instead.",
|
|
129
|
+
)
|
|
119
130
|
|
|
120
131
|
show_original_list_item_marker: bool = True
|
|
121
132
|
|
|
@@ -808,6 +819,65 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
808
819
|
)
|
|
809
820
|
|
|
810
821
|
|
|
822
|
+
class HTMLMetaSerializer(BaseModel, BaseMetaSerializer):
    """HTML-specific meta serializer.

    Each populated meta field of an item is rendered as a
    ``<div data-meta-<name>>...</div>`` element and the rendered fields are
    joined with newlines.
    """

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the item's meta.

        Args:
            item: Node whose ``meta`` object is serialized.
            doc: Document being serialized (not read by this method).
            **kwargs: Parameter overrides, passed on to ``HTMLParams``.

        Returns:
            A serialization result whose text holds one line per rendered
            meta field; the text is empty when the item has no meta or no
            field passes the allow/block filters.
        """
        params = HTMLParams(**kwargs)

        rendered_fields: list[str] = []
        if item.meta:
            # Declared model fields come first, followed by custom entries.
            candidate_names = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for field_name in candidate_names:
                # An allow-list of None means "no restriction".
                if (
                    params.allowed_meta_names is not None
                    and field_name not in params.allowed_meta_names
                ):
                    continue
                if field_name in params.blocked_meta_names:
                    continue
                # Unset or suppressed fields serialize to None and are dropped.
                if rendered := self._serialize_meta_field(item.meta, field_name):
                    rendered_fields.append(rendered)

        return create_ser_result(
            text="\n".join(rendered_fields),
            # NOTE for now using an empty span source for GroupItems
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
        """Render a single meta field as an HTML ``div``.

        Args:
            meta: Meta object holding the field.
            name: Attribute name of the field on ``meta``.

        Returns:
            The ``<div data-meta-<name>>`` element with HTML-escaped text
            content, or ``None`` when the field is unset, is a tabular chart,
            or stringifies to an empty value.
        """
        # Local import keeps this block self-contained; only element text is
        # escaped, so quotes are left untouched.
        from html import escape as _escape_html

        field_val = getattr(meta, name)
        if field_val is None:
            return None

        if isinstance(field_val, (SummaryMetaField, DescriptionMetaField)):
            txt = field_val.text
        elif isinstance(field_val, PictureClassificationMetaField):
            txt = self._humanize_text(field_val.get_main_prediction().class_name)
        elif isinstance(field_val, MoleculeMetaField):
            txt = field_val.smi
        elif isinstance(field_val, TabularChartMetaField):
            # suppressing tabular chart serialization
            return None
        else:
            # Generic/custom values: fall back to their string form.
            txt = str(field_val or "")
            if not txt:
                return None

        # Meta text is free-form content; escape it so characters such as
        # "<" or "&" cannot break or inject markup in the generated HTML.
        return f"<div data-meta-{name}>{_escape_html(txt, quote=False)}</div>"
|
|
879
|
+
|
|
880
|
+
|
|
811
881
|
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
812
882
|
"""HTML-specific annotation serializer."""
|
|
813
883
|
|
|
@@ -858,6 +928,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
858
928
|
list_serializer: BaseListSerializer = HTMLListSerializer()
|
|
859
929
|
inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
|
|
860
930
|
|
|
931
|
+
meta_serializer: BaseMetaSerializer = HTMLMetaSerializer()
|
|
861
932
|
annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
|
|
862
933
|
|
|
863
934
|
params: HTMLParams = HTMLParams()
|
|
@@ -1047,7 +1118,11 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
1047
1118
|
)
|
|
1048
1119
|
results.append(cap_ser_res)
|
|
1049
1120
|
|
|
1050
|
-
if
|
|
1121
|
+
if (
|
|
1122
|
+
params.use_legacy_annotations
|
|
1123
|
+
and params.include_annotations
|
|
1124
|
+
and item.self_ref not in excluded_refs
|
|
1125
|
+
):
|
|
1051
1126
|
if isinstance(item, (PictureItem, TableItem)):
|
|
1052
1127
|
ann_res = self.serialize_annotations(
|
|
1053
1128
|
item=item,
|
|
@@ -11,7 +11,7 @@ from enum import Enum
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Any, Optional, Union
|
|
13
13
|
|
|
14
|
-
from pydantic import AnyUrl, BaseModel, PositiveInt
|
|
14
|
+
from pydantic import AnyUrl, BaseModel, Field, PositiveInt
|
|
15
15
|
from tabulate import tabulate
|
|
16
16
|
from typing_extensions import override
|
|
17
17
|
|
|
@@ -23,6 +23,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
23
23
|
BaseInlineSerializer,
|
|
24
24
|
BaseKeyValueSerializer,
|
|
25
25
|
BaseListSerializer,
|
|
26
|
+
BaseMetaSerializer,
|
|
26
27
|
BasePictureSerializer,
|
|
27
28
|
BaseTableSerializer,
|
|
28
29
|
BaseTextSerializer,
|
|
@@ -36,9 +37,11 @@ from docling_core.transforms.serializer.common import (
|
|
|
36
37
|
)
|
|
37
38
|
from docling_core.types.doc.base import ImageRefMode
|
|
38
39
|
from docling_core.types.doc.document import (
|
|
40
|
+
BaseMeta,
|
|
39
41
|
CodeItem,
|
|
40
42
|
ContentLayer,
|
|
41
43
|
DescriptionAnnotation,
|
|
44
|
+
DescriptionMetaField,
|
|
42
45
|
DocItem,
|
|
43
46
|
DocItemLabel,
|
|
44
47
|
DoclingDocument,
|
|
@@ -52,14 +55,18 @@ from docling_core.types.doc.document import (
|
|
|
52
55
|
KeyValueItem,
|
|
53
56
|
ListGroup,
|
|
54
57
|
ListItem,
|
|
58
|
+
MoleculeMetaField,
|
|
55
59
|
NodeItem,
|
|
56
60
|
PictureClassificationData,
|
|
61
|
+
PictureClassificationMetaField,
|
|
57
62
|
PictureItem,
|
|
58
63
|
PictureMoleculeData,
|
|
59
64
|
PictureTabularChartData,
|
|
60
65
|
RichTableCell,
|
|
61
66
|
SectionHeaderItem,
|
|
67
|
+
SummaryMetaField,
|
|
62
68
|
TableItem,
|
|
69
|
+
TabularChartMetaField,
|
|
63
70
|
TextItem,
|
|
64
71
|
TitleItem,
|
|
65
72
|
)
|
|
@@ -102,8 +109,17 @@ class MarkdownParams(CommonParams):
|
|
|
102
109
|
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
|
|
103
110
|
escape_underscores: bool = True
|
|
104
111
|
escape_html: bool = True
|
|
105
|
-
|
|
106
|
-
|
|
112
|
+
mark_meta: bool = Field(default=False, description="Mark meta sections.")
|
|
113
|
+
include_annotations: bool = Field(
|
|
114
|
+
default=True,
|
|
115
|
+
description="Include item annotations.",
|
|
116
|
+
deprecated="Use include_meta instead.",
|
|
117
|
+
)
|
|
118
|
+
mark_annotations: bool = Field(
|
|
119
|
+
default=False,
|
|
120
|
+
description="Mark annotation sections.",
|
|
121
|
+
deprecated="Use mark_meta instead.",
|
|
122
|
+
)
|
|
107
123
|
orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
|
|
108
124
|
ensure_valid_list_item_marker: bool = True
|
|
109
125
|
|
|
@@ -245,9 +261,77 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
245
261
|
return create_ser_result(text=text, span_source=res_parts)
|
|
246
262
|
|
|
247
263
|
|
|
264
|
+
class MarkdownMetaSerializer(BaseModel, BaseMetaSerializer):
    """Markdown-specific meta serializer.

    Renders every populated meta field of an item as its own paragraph,
    optionally prefixed with a ``[Field Name]`` marker.
    """

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the item's meta.

        ``kwargs`` are passed on to ``MarkdownParams``; ``doc`` is not read
        here. Rendered fields are separated by a blank line.
        """
        params = MarkdownParams(**kwargs)

        paragraphs: list[str] = []
        if item.meta:
            # Declared model fields first, then the custom entries.
            field_names = list(item.meta.__class__.model_fields)
            field_names += list(item.meta.get_custom_part())
            for field_name in field_names:
                allowed = (
                    params.allowed_meta_names is None
                    or field_name in params.allowed_meta_names
                )
                if not allowed or field_name in params.blocked_meta_names:
                    continue
                paragraph = self._serialize_meta_field(
                    item.meta, field_name, params.mark_meta
                )
                # None / empty results are dropped from the output.
                if paragraph:
                    paragraphs.append(paragraph)

        # NOTE for now using an empty span source for GroupItems
        source = item if isinstance(item, DocItem) else []
        return create_ser_result(text="\n\n".join(paragraphs), span_source=source)

    def _serialize_meta_field(
        self, meta: BaseMeta, name: str, mark_meta: bool
    ) -> Optional[str]:
        """Render one meta field as Markdown text.

        Returns ``None`` when the field is unset, is a tabular chart, or
        stringifies to an empty value. With ``mark_meta`` set, the text is
        prefixed with the humanized field name in square brackets.
        """
        value = getattr(meta, name)
        if value is None:
            return None

        if isinstance(value, (SummaryMetaField, DescriptionMetaField)):
            body = value.text
        elif isinstance(value, PictureClassificationMetaField):
            body = self._humanize_text(value.get_main_prediction().class_name)
        elif isinstance(value, MoleculeMetaField):
            body = value.smi
        elif isinstance(value, TabularChartMetaField):
            # suppressing tabular chart serialization
            return None
        else:
            # Generic/custom values fall back to their string form.
            body = str(value or "")
            if not body:
                return None

        if mark_meta:
            return f"[{self._humanize_text(name, title=True)}] {body}"
        return body
|
|
329
|
+
|
|
330
|
+
|
|
248
331
|
class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
249
332
|
"""Markdown-specific annotation serializer."""
|
|
250
333
|
|
|
334
|
+
@override
|
|
251
335
|
def serialize(
|
|
252
336
|
self,
|
|
253
337
|
*,
|
|
@@ -313,7 +397,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
313
397
|
|
|
314
398
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
315
399
|
|
|
316
|
-
if params.include_annotations:
|
|
400
|
+
if params.use_legacy_annotations and params.include_annotations:
|
|
317
401
|
|
|
318
402
|
ann_res = doc_serializer.serialize_annotations(
|
|
319
403
|
item=item,
|
|
@@ -382,7 +466,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
382
466
|
res_parts.append(cap_res)
|
|
383
467
|
|
|
384
468
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
385
|
-
if params.include_annotations:
|
|
469
|
+
if params.use_legacy_annotations and params.include_annotations:
|
|
386
470
|
ann_res = doc_serializer.serialize_annotations(
|
|
387
471
|
item=item,
|
|
388
472
|
**kwargs,
|
|
@@ -629,6 +713,7 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
629
713
|
list_serializer: BaseListSerializer = MarkdownListSerializer()
|
|
630
714
|
inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
|
|
631
715
|
|
|
716
|
+
meta_serializer: BaseMetaSerializer = MarkdownMetaSerializer()
|
|
632
717
|
annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
|
|
633
718
|
|
|
634
719
|
params: MarkdownParams = MarkdownParams()
|
|
@@ -727,3 +812,22 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
727
812
|
def requires_page_break(self) -> bool:
|
|
728
813
|
"""Whether to add page breaks."""
|
|
729
814
|
return self.params.page_break_placeholder is not None
|
|
815
|
+
|
|
816
|
+
@override
def serialize(
    self,
    *,
    item: Optional[NodeItem] = None,
    list_level: int = 0,
    is_inline_scope: bool = False,
    visited: Optional[set[str]] = None,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize a given node.

    Delegates to the parent implementation, supplying a blank-line
    ``delim`` unless the caller passed its own ``delim`` in ``kwargs``.
    """
    # Caller-supplied kwargs take precedence over the default delimiter.
    forwarded = {"delim": "\n\n", **kwargs}
    return super().serialize(
        item=item,
        list_level=list_level,
        is_inline_scope=is_inline_scope,
        visited=visited,
        **forwarded,
    )
|
|
@@ -9,6 +9,8 @@ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
|
|
|
9
9
|
from .document import (
|
|
10
10
|
AnyTableCell,
|
|
11
11
|
BaseAnnotation,
|
|
12
|
+
BaseMeta,
|
|
13
|
+
BasePrediction,
|
|
12
14
|
ChartBar,
|
|
13
15
|
ChartLine,
|
|
14
16
|
ChartPoint,
|
|
@@ -17,12 +19,14 @@ from .document import (
|
|
|
17
19
|
CodeItem,
|
|
18
20
|
ContentLayer,
|
|
19
21
|
DescriptionAnnotation,
|
|
22
|
+
DescriptionMetaField,
|
|
20
23
|
DocItem,
|
|
21
24
|
DoclingDocument,
|
|
22
25
|
DocTagsDocument,
|
|
23
26
|
DocTagsPage,
|
|
24
27
|
DocumentOrigin,
|
|
25
28
|
FloatingItem,
|
|
29
|
+
FloatingMeta,
|
|
26
30
|
Formatting,
|
|
27
31
|
FormItem,
|
|
28
32
|
FormulaItem,
|
|
@@ -35,7 +39,10 @@ from .document import (
|
|
|
35
39
|
KeyValueItem,
|
|
36
40
|
ListGroup,
|
|
37
41
|
ListItem,
|
|
42
|
+
MetaFieldName,
|
|
43
|
+
MetaUtils,
|
|
38
44
|
MiscAnnotation,
|
|
45
|
+
MoleculeMetaField,
|
|
39
46
|
NodeItem,
|
|
40
47
|
OrderedList,
|
|
41
48
|
PageItem,
|
|
@@ -43,9 +50,11 @@ from .document import (
|
|
|
43
50
|
PictureChartData,
|
|
44
51
|
PictureClassificationClass,
|
|
45
52
|
PictureClassificationData,
|
|
53
|
+
PictureClassificationMetaField,
|
|
46
54
|
PictureDataType,
|
|
47
55
|
PictureItem,
|
|
48
56
|
PictureLineChartData,
|
|
57
|
+
PictureMeta,
|
|
49
58
|
PictureMoleculeData,
|
|
50
59
|
PicturePieChartData,
|
|
51
60
|
PictureScatterChartData,
|
|
@@ -56,9 +65,11 @@ from .document import (
|
|
|
56
65
|
RichTableCell,
|
|
57
66
|
Script,
|
|
58
67
|
SectionHeaderItem,
|
|
68
|
+
SummaryMetaField,
|
|
59
69
|
TableCell,
|
|
60
70
|
TableData,
|
|
61
71
|
TableItem,
|
|
72
|
+
TabularChartMetaField,
|
|
62
73
|
TextItem,
|
|
63
74
|
TitleItem,
|
|
64
75
|
UnorderedList,
|