docling-core 2.48.4__py3-none-any.whl → 2.50.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core has been flagged as potentially problematic (the linked details are not part of this extract).
- docling_core/cli/view.py +21 -5
- docling_core/transforms/serializer/base.py +31 -0
- docling_core/transforms/serializer/common.py +180 -100
- docling_core/transforms/serializer/doctags.py +35 -20
- docling_core/transforms/serializer/html.py +78 -3
- docling_core/transforms/serializer/markdown.py +114 -5
- docling_core/types/doc/__init__.py +11 -0
- docling_core/types/doc/document.py +359 -8
- docling_core/types/doc/tokens.py +6 -0
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/METADATA +9 -4
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/RECORD +15 -15
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/WHEEL +0 -0
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/top_level.txt +0 -0
|
@@ -17,7 +17,7 @@ from xml.sax.saxutils import unescape
|
|
|
17
17
|
|
|
18
18
|
import latex2mathml.converter
|
|
19
19
|
from PIL.Image import Image
|
|
20
|
-
from pydantic import AnyUrl, BaseModel
|
|
20
|
+
from pydantic import AnyUrl, BaseModel, Field
|
|
21
21
|
from typing_extensions import override
|
|
22
22
|
|
|
23
23
|
from docling_core.transforms.serializer.base import (
|
|
@@ -28,6 +28,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
28
28
|
BaseInlineSerializer,
|
|
29
29
|
BaseKeyValueSerializer,
|
|
30
30
|
BaseListSerializer,
|
|
31
|
+
BaseMetaSerializer,
|
|
31
32
|
BasePictureSerializer,
|
|
32
33
|
BaseTableSerializer,
|
|
33
34
|
BaseTextSerializer,
|
|
@@ -46,9 +47,11 @@ from docling_core.transforms.serializer.html_styles import (
|
|
|
46
47
|
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
47
48
|
from docling_core.types.doc.base import ImageRefMode
|
|
48
49
|
from docling_core.types.doc.document import (
|
|
50
|
+
BaseMeta,
|
|
49
51
|
CodeItem,
|
|
50
52
|
ContentLayer,
|
|
51
53
|
DescriptionAnnotation,
|
|
54
|
+
DescriptionMetaField,
|
|
52
55
|
DocItem,
|
|
53
56
|
DoclingDocument,
|
|
54
57
|
FloatingItem,
|
|
@@ -61,14 +64,18 @@ from docling_core.types.doc.document import (
|
|
|
61
64
|
KeyValueItem,
|
|
62
65
|
ListGroup,
|
|
63
66
|
ListItem,
|
|
67
|
+
MoleculeMetaField,
|
|
64
68
|
NodeItem,
|
|
65
69
|
PictureClassificationData,
|
|
70
|
+
PictureClassificationMetaField,
|
|
66
71
|
PictureItem,
|
|
67
72
|
PictureMoleculeData,
|
|
68
73
|
PictureTabularChartData,
|
|
69
74
|
RichTableCell,
|
|
70
75
|
SectionHeaderItem,
|
|
76
|
+
SummaryMetaField,
|
|
71
77
|
TableItem,
|
|
78
|
+
TabularChartMetaField,
|
|
72
79
|
TextItem,
|
|
73
80
|
TitleItem,
|
|
74
81
|
)
|
|
@@ -115,7 +122,11 @@ class HTMLParams(CommonParams):
|
|
|
115
122
|
# Enable charts to be printed into HTML as tables
|
|
116
123
|
enable_chart_tables: bool = True
|
|
117
124
|
|
|
118
|
-
include_annotations: bool = True
|
|
125
|
+
include_annotations: bool = Field(
|
|
126
|
+
default=True,
|
|
127
|
+
description="Include item annotations.",
|
|
128
|
+
deprecated="Use include_meta instead.",
|
|
129
|
+
)
|
|
119
130
|
|
|
120
131
|
show_original_list_item_marker: bool = True
|
|
121
132
|
|
|
@@ -808,6 +819,65 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
808
819
|
)
|
|
809
820
|
|
|
810
821
|
|
|
822
|
+
class HTMLMetaSerializer(BaseModel, BaseMetaSerializer):
    """HTML-specific meta serializer.

    Each populated meta field of a node is rendered as a
    ``<div data-meta-NAME>...</div>`` element; the elements are joined with
    single newlines.
    """

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the item's meta.

        Args:
            item: Node whose ``meta`` is rendered.
            doc: Owning document (not read by this method).
            **kwargs: Parameter overrides used to build ``HTMLParams``.

        Returns:
            A serialization result whose text holds one ``<div>`` per rendered
            field, or an empty string when the item has no meta or no field
            passes the allow/block filters.
        """
        params = HTMLParams(**kwargs)
        parts: list[str] = []
        if item.meta:
            # Declared model fields first, then any custom (extra) entries.
            names = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for key in names:
                if (
                    params.allowed_meta_names is not None
                    and key not in params.allowed_meta_names
                ):
                    continue
                if key in params.blocked_meta_names:
                    continue
                if rendered := self._serialize_meta_field(item.meta, key):
                    parts.append(rendered)
        return create_ser_result(
            text="\n".join(parts),
            # NOTE for now using an empty span source for GroupItems
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
        """Render a single meta field as an HTML ``<div>``.

        Args:
            meta: Meta object holding the field.
            name: Attribute name of the field on ``meta``.

        Returns:
            The ``<div data-meta-NAME>`` markup, or ``None`` when the field is
            unset, stringifies to an empty value, or is a tabular chart
            (deliberately suppressed).
        """
        # Function-scope import so this block does not depend on the file's
        # top-level import list.
        from html import escape

        field_val = getattr(meta, name)
        if field_val is None:
            return None

        if isinstance(field_val, (SummaryMetaField, DescriptionMetaField)):
            txt = field_val.text
        elif isinstance(field_val, PictureClassificationMetaField):
            txt = self._humanize_text(field_val.get_main_prediction().class_name)
        elif isinstance(field_val, MoleculeMetaField):
            txt = field_val.smi
        elif isinstance(field_val, TabularChartMetaField):
            # suppressing tabular chart serialization
            return None
        elif tmp := str(field_val or ""):
            txt = tmp
        else:
            return None

        # Meta text is free-form content; escape it so characters such as
        # "<" or "&" cannot break or inject markup in the generated HTML.
        return f"<div data-meta-{name}>{escape(str(txt), quote=False)}</div>"
|
|
879
|
+
|
|
880
|
+
|
|
811
881
|
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
812
882
|
"""HTML-specific annotation serializer."""
|
|
813
883
|
|
|
@@ -858,6 +928,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
858
928
|
list_serializer: BaseListSerializer = HTMLListSerializer()
|
|
859
929
|
inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
|
|
860
930
|
|
|
931
|
+
meta_serializer: BaseMetaSerializer = HTMLMetaSerializer()
|
|
861
932
|
annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
|
|
862
933
|
|
|
863
934
|
params: HTMLParams = HTMLParams()
|
|
@@ -1047,7 +1118,11 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
1047
1118
|
)
|
|
1048
1119
|
results.append(cap_ser_res)
|
|
1049
1120
|
|
|
1050
|
-
if params.include_annotations and item.self_ref not in excluded_refs:
|
|
1121
|
+
if (
|
|
1122
|
+
params.use_legacy_annotations
|
|
1123
|
+
and params.include_annotations
|
|
1124
|
+
and item.self_ref not in excluded_refs
|
|
1125
|
+
):
|
|
1051
1126
|
if isinstance(item, (PictureItem, TableItem)):
|
|
1052
1127
|
ann_res = self.serialize_annotations(
|
|
1053
1128
|
item=item,
|
|
@@ -11,7 +11,7 @@ from enum import Enum
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Any, Optional, Union
|
|
13
13
|
|
|
14
|
-
from pydantic import AnyUrl, BaseModel, PositiveInt
|
|
14
|
+
from pydantic import AnyUrl, BaseModel, Field, PositiveInt
|
|
15
15
|
from tabulate import tabulate
|
|
16
16
|
from typing_extensions import override
|
|
17
17
|
|
|
@@ -23,6 +23,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
23
23
|
BaseInlineSerializer,
|
|
24
24
|
BaseKeyValueSerializer,
|
|
25
25
|
BaseListSerializer,
|
|
26
|
+
BaseMetaSerializer,
|
|
26
27
|
BasePictureSerializer,
|
|
27
28
|
BaseTableSerializer,
|
|
28
29
|
BaseTextSerializer,
|
|
@@ -36,10 +37,13 @@ from docling_core.transforms.serializer.common import (
|
|
|
36
37
|
)
|
|
37
38
|
from docling_core.types.doc.base import ImageRefMode
|
|
38
39
|
from docling_core.types.doc.document import (
|
|
40
|
+
BaseMeta,
|
|
39
41
|
CodeItem,
|
|
40
42
|
ContentLayer,
|
|
41
43
|
DescriptionAnnotation,
|
|
44
|
+
DescriptionMetaField,
|
|
42
45
|
DocItem,
|
|
46
|
+
DocItemLabel,
|
|
43
47
|
DoclingDocument,
|
|
44
48
|
FloatingItem,
|
|
45
49
|
Formatting,
|
|
@@ -51,14 +55,18 @@ from docling_core.types.doc.document import (
|
|
|
51
55
|
KeyValueItem,
|
|
52
56
|
ListGroup,
|
|
53
57
|
ListItem,
|
|
58
|
+
MoleculeMetaField,
|
|
54
59
|
NodeItem,
|
|
55
60
|
PictureClassificationData,
|
|
61
|
+
PictureClassificationMetaField,
|
|
56
62
|
PictureItem,
|
|
57
63
|
PictureMoleculeData,
|
|
58
64
|
PictureTabularChartData,
|
|
59
65
|
RichTableCell,
|
|
60
66
|
SectionHeaderItem,
|
|
67
|
+
SummaryMetaField,
|
|
61
68
|
TableItem,
|
|
69
|
+
TabularChartMetaField,
|
|
62
70
|
TextItem,
|
|
63
71
|
TitleItem,
|
|
64
72
|
)
|
|
@@ -101,8 +109,17 @@ class MarkdownParams(CommonParams):
|
|
|
101
109
|
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
|
|
102
110
|
escape_underscores: bool = True
|
|
103
111
|
escape_html: bool = True
|
|
104
|
-
include_annotations: bool = True
|
|
105
|
-
mark_annotations: bool = False
|
|
112
|
+
mark_meta: bool = Field(default=False, description="Mark meta sections.")
|
|
113
|
+
include_annotations: bool = Field(
|
|
114
|
+
default=True,
|
|
115
|
+
description="Include item annotations.",
|
|
116
|
+
deprecated="Use include_meta instead.",
|
|
117
|
+
)
|
|
118
|
+
mark_annotations: bool = Field(
|
|
119
|
+
default=False,
|
|
120
|
+
description="Mark annotation sections.",
|
|
121
|
+
deprecated="Use mark_meta instead.",
|
|
122
|
+
)
|
|
106
123
|
orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
|
|
107
124
|
ensure_valid_list_item_marker: bool = True
|
|
108
125
|
|
|
@@ -140,6 +157,10 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
140
157
|
text = item.text
|
|
141
158
|
processing_pending = True
|
|
142
159
|
|
|
160
|
+
if item.label == DocItemLabel.CHECKBOX_SELECTED:
|
|
161
|
+
text = f"- [x] {text}"
|
|
162
|
+
if item.label == DocItemLabel.CHECKBOX_UNSELECTED:
|
|
163
|
+
text = f"- [ ] {text}"
|
|
143
164
|
if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
|
|
144
165
|
if not has_inline_repr:
|
|
145
166
|
# case where processing/formatting should be applied first (in inner scope)
|
|
@@ -240,9 +261,77 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
240
261
|
return create_ser_result(text=text, span_source=res_parts)
|
|
241
262
|
|
|
242
263
|
|
|
264
|
+
class MarkdownMetaSerializer(BaseModel, BaseMetaSerializer):
    """Meta serializer that emits Markdown text."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Render the item's meta fields as blank-line-separated paragraphs.

        Field names are taken from the meta model's declared fields followed
        by its custom part, then filtered through ``allowed_meta_names`` and
        ``blocked_meta_names`` from the Markdown parameters.
        """
        params = MarkdownParams(**kwargs)
        rendered = []
        if item.meta:
            candidate_names = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for key in candidate_names:
                allow_list = params.allowed_meta_names
                if allow_list is not None and key not in allow_list:
                    continue
                if key in params.blocked_meta_names:
                    continue
                piece = self._serialize_meta_field(item.meta, key, params.mark_meta)
                if piece:
                    rendered.append(piece)

        # NOTE for now using an empty span source for GroupItems
        span_source = item if isinstance(item, DocItem) else []
        return create_ser_result(text="\n\n".join(rendered), span_source=span_source)

    def _serialize_meta_field(
        self, meta: BaseMeta, name: str, mark_meta: bool
    ) -> Optional[str]:
        """Return the Markdown text for one meta field, or ``None`` to skip it.

        When ``mark_meta`` is true the text is prefixed with the humanized,
        title-cased field name in square brackets.
        """
        value = getattr(meta, name)
        if value is None:
            return None

        if isinstance(value, SummaryMetaField):
            body = value.text
        elif isinstance(value, DescriptionMetaField):
            body = value.text
        elif isinstance(value, PictureClassificationMetaField):
            body = self._humanize_text(value.get_main_prediction().class_name)
        elif isinstance(value, MoleculeMetaField):
            body = value.smi
        elif isinstance(value, TabularChartMetaField):
            # suppressing tabular chart serialization
            return None
        else:
            body = str(value or "")
            if not body:
                return None

        if mark_meta:
            return f"[{self._humanize_text(name, title=True)}] {body}"
        return body
|
|
329
|
+
|
|
330
|
+
|
|
243
331
|
class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
244
332
|
"""Markdown-specific annotation serializer."""
|
|
245
333
|
|
|
334
|
+
@override
|
|
246
335
|
def serialize(
|
|
247
336
|
self,
|
|
248
337
|
*,
|
|
@@ -308,7 +397,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
308
397
|
|
|
309
398
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
310
399
|
|
|
311
|
-
if params.include_annotations:
|
|
400
|
+
if params.use_legacy_annotations and params.include_annotations:
|
|
312
401
|
|
|
313
402
|
ann_res = doc_serializer.serialize_annotations(
|
|
314
403
|
item=item,
|
|
@@ -377,7 +466,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
377
466
|
res_parts.append(cap_res)
|
|
378
467
|
|
|
379
468
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
380
|
-
if params.include_annotations:
|
|
469
|
+
if params.use_legacy_annotations and params.include_annotations:
|
|
381
470
|
ann_res = doc_serializer.serialize_annotations(
|
|
382
471
|
item=item,
|
|
383
472
|
**kwargs,
|
|
@@ -624,6 +713,7 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
624
713
|
list_serializer: BaseListSerializer = MarkdownListSerializer()
|
|
625
714
|
inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
|
|
626
715
|
|
|
716
|
+
meta_serializer: BaseMetaSerializer = MarkdownMetaSerializer()
|
|
627
717
|
annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
|
|
628
718
|
|
|
629
719
|
params: MarkdownParams = MarkdownParams()
|
|
@@ -722,3 +812,22 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
722
812
|
def requires_page_break(self) -> bool:
|
|
723
813
|
"""Whether to add page breaks."""
|
|
724
814
|
return self.params.page_break_placeholder is not None
|
|
815
|
+
|
|
816
|
+
@override
def serialize(
    self,
    *,
    item: Optional[NodeItem] = None,
    list_level: int = 0,
    is_inline_scope: bool = False,
    visited: Optional[set[str]] = None,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize a given node.

    Forwards to the parent implementation with a blank-line ``delim``
    as the default; a ``delim`` supplied by the caller takes precedence.
    """
    # Caller-provided kwargs are unpacked last so they override the default.
    forwarded = {"delim": "\n\n", **kwargs}
    return super().serialize(
        item=item,
        list_level=list_level,
        is_inline_scope=is_inline_scope,
        visited=visited,
        **forwarded,
    )
|
|
@@ -9,6 +9,8 @@ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
|
|
|
9
9
|
from .document import (
|
|
10
10
|
AnyTableCell,
|
|
11
11
|
BaseAnnotation,
|
|
12
|
+
BaseMeta,
|
|
13
|
+
BasePrediction,
|
|
12
14
|
ChartBar,
|
|
13
15
|
ChartLine,
|
|
14
16
|
ChartPoint,
|
|
@@ -17,12 +19,14 @@ from .document import (
|
|
|
17
19
|
CodeItem,
|
|
18
20
|
ContentLayer,
|
|
19
21
|
DescriptionAnnotation,
|
|
22
|
+
DescriptionMetaField,
|
|
20
23
|
DocItem,
|
|
21
24
|
DoclingDocument,
|
|
22
25
|
DocTagsDocument,
|
|
23
26
|
DocTagsPage,
|
|
24
27
|
DocumentOrigin,
|
|
25
28
|
FloatingItem,
|
|
29
|
+
FloatingMeta,
|
|
26
30
|
Formatting,
|
|
27
31
|
FormItem,
|
|
28
32
|
FormulaItem,
|
|
@@ -35,7 +39,10 @@ from .document import (
|
|
|
35
39
|
KeyValueItem,
|
|
36
40
|
ListGroup,
|
|
37
41
|
ListItem,
|
|
42
|
+
MetaFieldName,
|
|
43
|
+
MetaUtils,
|
|
38
44
|
MiscAnnotation,
|
|
45
|
+
MoleculeMetaField,
|
|
39
46
|
NodeItem,
|
|
40
47
|
OrderedList,
|
|
41
48
|
PageItem,
|
|
@@ -43,9 +50,11 @@ from .document import (
|
|
|
43
50
|
PictureChartData,
|
|
44
51
|
PictureClassificationClass,
|
|
45
52
|
PictureClassificationData,
|
|
53
|
+
PictureClassificationMetaField,
|
|
46
54
|
PictureDataType,
|
|
47
55
|
PictureItem,
|
|
48
56
|
PictureLineChartData,
|
|
57
|
+
PictureMeta,
|
|
49
58
|
PictureMoleculeData,
|
|
50
59
|
PicturePieChartData,
|
|
51
60
|
PictureScatterChartData,
|
|
@@ -56,9 +65,11 @@ from .document import (
|
|
|
56
65
|
RichTableCell,
|
|
57
66
|
Script,
|
|
58
67
|
SectionHeaderItem,
|
|
68
|
+
SummaryMetaField,
|
|
59
69
|
TableCell,
|
|
60
70
|
TableData,
|
|
61
71
|
TableItem,
|
|
72
|
+
TabularChartMetaField,
|
|
62
73
|
TextItem,
|
|
63
74
|
TitleItem,
|
|
64
75
|
UnorderedList,
|