docling-core: 2.49.0 (py3-none-any.whl) → 2.50.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -17,7 +17,7 @@ from xml.sax.saxutils import unescape
17
17
 
18
18
  import latex2mathml.converter
19
19
  from PIL.Image import Image
20
- from pydantic import AnyUrl, BaseModel
20
+ from pydantic import AnyUrl, BaseModel, Field
21
21
  from typing_extensions import override
22
22
 
23
23
  from docling_core.transforms.serializer.base import (
@@ -28,6 +28,7 @@ from docling_core.transforms.serializer.base import (
28
28
  BaseInlineSerializer,
29
29
  BaseKeyValueSerializer,
30
30
  BaseListSerializer,
31
+ BaseMetaSerializer,
31
32
  BasePictureSerializer,
32
33
  BaseTableSerializer,
33
34
  BaseTextSerializer,
@@ -46,9 +47,11 @@ from docling_core.transforms.serializer.html_styles import (
46
47
  from docling_core.transforms.visualizer.base import BaseVisualizer
47
48
  from docling_core.types.doc.base import ImageRefMode
48
49
  from docling_core.types.doc.document import (
50
+ BaseMeta,
49
51
  CodeItem,
50
52
  ContentLayer,
51
53
  DescriptionAnnotation,
54
+ DescriptionMetaField,
52
55
  DocItem,
53
56
  DoclingDocument,
54
57
  FloatingItem,
@@ -61,14 +64,18 @@ from docling_core.types.doc.document import (
61
64
  KeyValueItem,
62
65
  ListGroup,
63
66
  ListItem,
67
+ MoleculeMetaField,
64
68
  NodeItem,
65
69
  PictureClassificationData,
70
+ PictureClassificationMetaField,
66
71
  PictureItem,
67
72
  PictureMoleculeData,
68
73
  PictureTabularChartData,
69
74
  RichTableCell,
70
75
  SectionHeaderItem,
76
+ SummaryMetaField,
71
77
  TableItem,
78
+ TabularChartMetaField,
72
79
  TextItem,
73
80
  TitleItem,
74
81
  )
@@ -115,7 +122,11 @@ class HTMLParams(CommonParams):
115
122
  # Enable charts to be printed into HTML as tables
116
123
  enable_chart_tables: bool = True
117
124
 
118
- include_annotations: bool = True
125
+ include_annotations: bool = Field(
126
+ default=True,
127
+ description="Include item annotations.",
128
+ deprecated="Use include_meta instead.",
129
+ )
119
130
 
120
131
  show_original_list_item_marker: bool = True
121
132
 
@@ -808,6 +819,65 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
808
819
  )
809
820
 
810
821
 
822
class HTMLMetaSerializer(BaseModel, BaseMetaSerializer):
    """HTML-specific meta serializer.

    Renders each selected meta field of an item as a
    ``<div data-meta-<field name>>`` element and joins the elements with
    newlines.
    """

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the item's meta.

        Args:
            item: Node whose ``meta`` is serialized.
            doc: Owning document (not read by this implementation).
            **kwargs: Overrides used to build the ``HTMLParams`` that supply
                ``allowed_meta_names`` and ``blocked_meta_names``.

        Returns:
            A serialization result whose text holds one ``<div>`` per
            rendered meta field, newline-separated; the text is empty when
            the item has no meta or nothing passes the filters.
        """
        params = HTMLParams(**kwargs)
        parts: list[str] = []
        if item.meta:
            # Declared model fields come first, then any custom fields.
            field_names = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for key in field_names:
                if (
                    params.allowed_meta_names is not None
                    and key not in params.allowed_meta_names
                ):
                    continue
                if key in params.blocked_meta_names:
                    continue
                if rendered := self._serialize_meta_field(item.meta, key):
                    parts.append(rendered)
        return create_ser_result(
            text="\n".join(parts),
            span_source=item if isinstance(item, DocItem) else [],
            # NOTE for now using an empty span source for GroupItems
        )

    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
        """Render a single meta field as an HTML ``<div>``.

        Args:
            meta: Meta object holding the field.
            name: Attribute name of the field on ``meta``.

        Returns:
            The ``<div>`` markup, or ``None`` when the field is unset, is a
            tabular chart, or has no non-empty string form.
        """
        # Local import keeps this block self-contained regardless of which
        # names the module already imports at the top of the file.
        from html import escape as html_escape

        field_val = getattr(meta, name)
        if field_val is None:
            return None

        if isinstance(field_val, (SummaryMetaField, DescriptionMetaField)):
            txt = field_val.text
        elif isinstance(field_val, PictureClassificationMetaField):
            txt = self._humanize_text(field_val.get_main_prediction().class_name)
        elif isinstance(field_val, MoleculeMetaField):
            txt = field_val.smi
        elif isinstance(field_val, TabularChartMetaField):
            # suppressing tabular chart serialization
            return None
        elif tmp := str(field_val or ""):
            txt = tmp
        else:
            return None

        # The text originates from document content, so escape it before
        # embedding it as element content; otherwise "<", ">" or "&" in a
        # summary/description would be interpreted as markup.
        # NOTE(review): `name` is emitted verbatim inside the attribute name;
        # confirm custom meta keys are limited to attribute-safe characters.
        return f"<div data-meta-{name}>{html_escape(txt, quote=False)}</div>"
879
+
880
+
811
881
  class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
812
882
  """HTML-specific annotation serializer."""
813
883
 
@@ -858,6 +928,7 @@ class HTMLDocSerializer(DocSerializer):
858
928
  list_serializer: BaseListSerializer = HTMLListSerializer()
859
929
  inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
860
930
 
931
+ meta_serializer: BaseMetaSerializer = HTMLMetaSerializer()
861
932
  annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
862
933
 
863
934
  params: HTMLParams = HTMLParams()
@@ -1047,7 +1118,11 @@ class HTMLDocSerializer(DocSerializer):
1047
1118
  )
1048
1119
  results.append(cap_ser_res)
1049
1120
 
1050
- if params.include_annotations and item.self_ref not in excluded_refs:
1121
+ if (
1122
+ params.use_legacy_annotations
1123
+ and params.include_annotations
1124
+ and item.self_ref not in excluded_refs
1125
+ ):
1051
1126
  if isinstance(item, (PictureItem, TableItem)):
1052
1127
  ann_res = self.serialize_annotations(
1053
1128
  item=item,
@@ -11,7 +11,7 @@ from enum import Enum
11
11
  from pathlib import Path
12
12
  from typing import Any, Optional, Union
13
13
 
14
- from pydantic import AnyUrl, BaseModel, PositiveInt
14
+ from pydantic import AnyUrl, BaseModel, Field, PositiveInt
15
15
  from tabulate import tabulate
16
16
  from typing_extensions import override
17
17
 
@@ -23,6 +23,7 @@ from docling_core.transforms.serializer.base import (
23
23
  BaseInlineSerializer,
24
24
  BaseKeyValueSerializer,
25
25
  BaseListSerializer,
26
+ BaseMetaSerializer,
26
27
  BasePictureSerializer,
27
28
  BaseTableSerializer,
28
29
  BaseTextSerializer,
@@ -36,9 +37,11 @@ from docling_core.transforms.serializer.common import (
36
37
  )
37
38
  from docling_core.types.doc.base import ImageRefMode
38
39
  from docling_core.types.doc.document import (
40
+ BaseMeta,
39
41
  CodeItem,
40
42
  ContentLayer,
41
43
  DescriptionAnnotation,
44
+ DescriptionMetaField,
42
45
  DocItem,
43
46
  DocItemLabel,
44
47
  DoclingDocument,
@@ -52,14 +55,18 @@ from docling_core.types.doc.document import (
52
55
  KeyValueItem,
53
56
  ListGroup,
54
57
  ListItem,
58
+ MoleculeMetaField,
55
59
  NodeItem,
56
60
  PictureClassificationData,
61
+ PictureClassificationMetaField,
57
62
  PictureItem,
58
63
  PictureMoleculeData,
59
64
  PictureTabularChartData,
60
65
  RichTableCell,
61
66
  SectionHeaderItem,
67
+ SummaryMetaField,
62
68
  TableItem,
69
+ TabularChartMetaField,
63
70
  TextItem,
64
71
  TitleItem,
65
72
  )
@@ -102,8 +109,17 @@ class MarkdownParams(CommonParams):
102
109
  page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
103
110
  escape_underscores: bool = True
104
111
  escape_html: bool = True
105
- include_annotations: bool = True
106
- mark_annotations: bool = False
112
+ mark_meta: bool = Field(default=False, description="Mark meta sections.")
113
+ include_annotations: bool = Field(
114
+ default=True,
115
+ description="Include item annotations.",
116
+ deprecated="Use include_meta instead.",
117
+ )
118
+ mark_annotations: bool = Field(
119
+ default=False,
120
+ description="Mark annotation sections.",
121
+ deprecated="Use mark_meta instead.",
122
+ )
107
123
  orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
108
124
  ensure_valid_list_item_marker: bool = True
109
125
 
@@ -245,9 +261,77 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
245
261
  return create_ser_result(text=text, span_source=res_parts)
246
262
 
247
263
 
264
class MarkdownMetaSerializer(BaseModel, BaseMetaSerializer):
    """Markdown-specific meta serializer.

    Renders each selected meta field of an item as a text paragraph and
    joins the paragraphs with blank lines.
    """

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the item's meta.

        Args:
            item: Node whose ``meta`` is serialized.
            doc: Owning document (not read by this implementation).
            **kwargs: Overrides used to build the ``MarkdownParams`` that
                supply ``allowed_meta_names``, ``blocked_meta_names`` and
                ``mark_meta``.

        Returns:
            A serialization result whose text holds the rendered meta
            fields separated by blank lines; the text is empty when the
            item has no meta or nothing passes the filters.
        """
        params = MarkdownParams(**kwargs)
        paragraphs: list[str] = []
        if item.meta:
            # Declared model fields come first, then any custom fields.
            candidate_names = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for field_name in candidate_names:
                if (
                    params.allowed_meta_names is not None
                    and field_name not in params.allowed_meta_names
                ):
                    continue
                if field_name in params.blocked_meta_names:
                    continue
                rendered = self._serialize_meta_field(
                    item.meta, field_name, params.mark_meta
                )
                if rendered:
                    paragraphs.append(rendered)

        # NOTE for now using an empty span source for GroupItems
        source = item if isinstance(item, DocItem) else []
        return create_ser_result(text="\n\n".join(paragraphs), span_source=source)

    def _serialize_meta_field(
        self, meta: BaseMeta, name: str, mark_meta: bool
    ) -> Optional[str]:
        """Render a single meta field as Markdown text.

        Args:
            meta: Meta object holding the field.
            name: Attribute name of the field on ``meta``.
            mark_meta: When true, prefix the text with the humanized,
                title-cased field name in square brackets.

        Returns:
            The rendered text, or ``None`` when the field is unset, is a
            tabular chart, or has no non-empty string form.
        """
        value = getattr(meta, name)
        if value is None:
            return None

        if isinstance(value, (SummaryMetaField, DescriptionMetaField)):
            body = value.text
        elif isinstance(value, PictureClassificationMetaField):
            body = self._humanize_text(value.get_main_prediction().class_name)
        elif isinstance(value, MoleculeMetaField):
            body = value.smi
        elif isinstance(value, TabularChartMetaField):
            # suppressing tabular chart serialization
            return None
        else:
            body = str(value or "")
            if not body:
                return None

        if mark_meta:
            return f"[{self._humanize_text(name, title=True)}] {body}"
        return body
329
+
330
+
248
331
  class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
249
332
  """Markdown-specific annotation serializer."""
250
333
 
334
+ @override
251
335
  def serialize(
252
336
  self,
253
337
  *,
@@ -313,7 +397,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
313
397
 
314
398
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
315
399
 
316
- if params.include_annotations:
400
+ if params.use_legacy_annotations and params.include_annotations:
317
401
 
318
402
  ann_res = doc_serializer.serialize_annotations(
319
403
  item=item,
@@ -382,7 +466,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
382
466
  res_parts.append(cap_res)
383
467
 
384
468
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
385
- if params.include_annotations:
469
+ if params.use_legacy_annotations and params.include_annotations:
386
470
  ann_res = doc_serializer.serialize_annotations(
387
471
  item=item,
388
472
  **kwargs,
@@ -629,6 +713,7 @@ class MarkdownDocSerializer(DocSerializer):
629
713
  list_serializer: BaseListSerializer = MarkdownListSerializer()
630
714
  inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
631
715
 
716
+ meta_serializer: BaseMetaSerializer = MarkdownMetaSerializer()
632
717
  annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
633
718
 
634
719
  params: MarkdownParams = MarkdownParams()
@@ -727,3 +812,22 @@ class MarkdownDocSerializer(DocSerializer):
727
812
  def requires_page_break(self) -> bool:
728
813
  """Whether to add page breaks."""
729
814
  return self.params.page_break_placeholder is not None
815
+
816
@override
def serialize(
    self,
    *,
    item: Optional[NodeItem] = None,
    list_level: int = 0,
    is_inline_scope: bool = False,
    visited: Optional[set[str]] = None,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize a given node.

    Delegates to the parent implementation, passing a two-newline
    ``delim`` keyword unless the caller supplies its own ``delim``.
    """
    # Caller-provided keyword arguments take precedence over the default.
    forwarded: dict[str, Any] = {"delim": "\n\n", **kwargs}
    return super().serialize(
        item=item,
        list_level=list_level,
        is_inline_scope=is_inline_scope,
        visited=visited,
        **forwarded,
    )
@@ -9,6 +9,8 @@ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
10
  AnyTableCell,
11
11
  BaseAnnotation,
12
+ BaseMeta,
13
+ BasePrediction,
12
14
  ChartBar,
13
15
  ChartLine,
14
16
  ChartPoint,
@@ -17,12 +19,14 @@ from .document import (
17
19
  CodeItem,
18
20
  ContentLayer,
19
21
  DescriptionAnnotation,
22
+ DescriptionMetaField,
20
23
  DocItem,
21
24
  DoclingDocument,
22
25
  DocTagsDocument,
23
26
  DocTagsPage,
24
27
  DocumentOrigin,
25
28
  FloatingItem,
29
+ FloatingMeta,
26
30
  Formatting,
27
31
  FormItem,
28
32
  FormulaItem,
@@ -35,7 +39,10 @@ from .document import (
35
39
  KeyValueItem,
36
40
  ListGroup,
37
41
  ListItem,
42
+ MetaFieldName,
43
+ MetaUtils,
38
44
  MiscAnnotation,
45
+ MoleculeMetaField,
39
46
  NodeItem,
40
47
  OrderedList,
41
48
  PageItem,
@@ -43,9 +50,11 @@ from .document import (
43
50
  PictureChartData,
44
51
  PictureClassificationClass,
45
52
  PictureClassificationData,
53
+ PictureClassificationMetaField,
46
54
  PictureDataType,
47
55
  PictureItem,
48
56
  PictureLineChartData,
57
+ PictureMeta,
49
58
  PictureMoleculeData,
50
59
  PicturePieChartData,
51
60
  PictureScatterChartData,
@@ -56,9 +65,11 @@ from .document import (
56
65
  RichTableCell,
57
66
  Script,
58
67
  SectionHeaderItem,
68
+ SummaryMetaField,
59
69
  TableCell,
60
70
  TableData,
61
71
  TableItem,
72
+ TabularChartMetaField,
62
73
  TextItem,
63
74
  TitleItem,
64
75
  UnorderedList,