docling-core 2.48.4 → 2.50.0 (py3-none-any wheels)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. (The original page linked to more details here; the link target was not preserved in this text.)

@@ -17,7 +17,7 @@ from xml.sax.saxutils import unescape
17
17
 
18
18
  import latex2mathml.converter
19
19
  from PIL.Image import Image
20
- from pydantic import AnyUrl, BaseModel
20
+ from pydantic import AnyUrl, BaseModel, Field
21
21
  from typing_extensions import override
22
22
 
23
23
  from docling_core.transforms.serializer.base import (
@@ -28,6 +28,7 @@ from docling_core.transforms.serializer.base import (
28
28
  BaseInlineSerializer,
29
29
  BaseKeyValueSerializer,
30
30
  BaseListSerializer,
31
+ BaseMetaSerializer,
31
32
  BasePictureSerializer,
32
33
  BaseTableSerializer,
33
34
  BaseTextSerializer,
@@ -46,9 +47,11 @@ from docling_core.transforms.serializer.html_styles import (
46
47
  from docling_core.transforms.visualizer.base import BaseVisualizer
47
48
  from docling_core.types.doc.base import ImageRefMode
48
49
  from docling_core.types.doc.document import (
50
+ BaseMeta,
49
51
  CodeItem,
50
52
  ContentLayer,
51
53
  DescriptionAnnotation,
54
+ DescriptionMetaField,
52
55
  DocItem,
53
56
  DoclingDocument,
54
57
  FloatingItem,
@@ -61,14 +64,18 @@ from docling_core.types.doc.document import (
61
64
  KeyValueItem,
62
65
  ListGroup,
63
66
  ListItem,
67
+ MoleculeMetaField,
64
68
  NodeItem,
65
69
  PictureClassificationData,
70
+ PictureClassificationMetaField,
66
71
  PictureItem,
67
72
  PictureMoleculeData,
68
73
  PictureTabularChartData,
69
74
  RichTableCell,
70
75
  SectionHeaderItem,
76
+ SummaryMetaField,
71
77
  TableItem,
78
+ TabularChartMetaField,
72
79
  TextItem,
73
80
  TitleItem,
74
81
  )
@@ -115,7 +122,11 @@ class HTMLParams(CommonParams):
115
122
  # Enable charts to be printed into HTML as tables
116
123
  enable_chart_tables: bool = True
117
124
 
118
- include_annotations: bool = True
125
+ include_annotations: bool = Field(
126
+ default=True,
127
+ description="Include item annotations.",
128
+ deprecated="Use include_meta instead.",
129
+ )
119
130
 
120
131
  show_original_list_item_marker: bool = True
121
132
 
@@ -808,6 +819,65 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
808
819
  )
809
820
 
810
821
 
822
+ class HTMLMetaSerializer(BaseModel, BaseMetaSerializer):
823
+ """HTML-specific meta serializer."""
824
+
825
+ @override
826
+ def serialize(
827
+ self,
828
+ *,
829
+ item: NodeItem,
830
+ doc: DoclingDocument,
831
+ **kwargs: Any,
832
+ ) -> SerializationResult:
833
+ """Serialize the item's meta."""
834
+ params = HTMLParams(**kwargs)
835
+ return create_ser_result(
836
+ text="\n".join(
837
+ [
838
+ tmp
839
+ for key in (
840
+ list(item.meta.__class__.model_fields)
841
+ + list(item.meta.get_custom_part())
842
+ )
843
+ if (
844
+ (
845
+ params.allowed_meta_names is None
846
+ or key in params.allowed_meta_names
847
+ )
848
+ and (key not in params.blocked_meta_names)
849
+ and (tmp := self._serialize_meta_field(item.meta, key))
850
+ )
851
+ ]
852
+ if item.meta
853
+ else []
854
+ ),
855
+ span_source=item if isinstance(item, DocItem) else [],
856
+ # NOTE for now using an empty span source for GroupItems
857
+ )
858
+
859
+ def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
860
+ if (field_val := getattr(meta, name)) is not None:
861
+ if isinstance(field_val, SummaryMetaField):
862
+ txt = field_val.text
863
+ elif isinstance(field_val, DescriptionMetaField):
864
+ txt = field_val.text
865
+ elif isinstance(field_val, PictureClassificationMetaField):
866
+ txt = self._humanize_text(field_val.get_main_prediction().class_name)
867
+ elif isinstance(field_val, MoleculeMetaField):
868
+ txt = field_val.smi
869
+ elif isinstance(field_val, TabularChartMetaField):
870
+ # suppressing tabular chart serialization
871
+ return None
872
+ elif tmp := str(field_val or ""):
873
+ txt = tmp
874
+ else:
875
+ return None
876
+ return f"<div data-meta-{name}>{txt}</div>"
877
+ else:
878
+ return None
879
+
880
+
811
881
  class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
812
882
  """HTML-specific annotation serializer."""
813
883
 
@@ -858,6 +928,7 @@ class HTMLDocSerializer(DocSerializer):
858
928
  list_serializer: BaseListSerializer = HTMLListSerializer()
859
929
  inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
860
930
 
931
+ meta_serializer: BaseMetaSerializer = HTMLMetaSerializer()
861
932
  annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
862
933
 
863
934
  params: HTMLParams = HTMLParams()
@@ -1047,7 +1118,11 @@ class HTMLDocSerializer(DocSerializer):
1047
1118
  )
1048
1119
  results.append(cap_ser_res)
1049
1120
 
1050
- if params.include_annotations and item.self_ref not in excluded_refs:
1121
+ if (
1122
+ params.use_legacy_annotations
1123
+ and params.include_annotations
1124
+ and item.self_ref not in excluded_refs
1125
+ ):
1051
1126
  if isinstance(item, (PictureItem, TableItem)):
1052
1127
  ann_res = self.serialize_annotations(
1053
1128
  item=item,
@@ -11,7 +11,7 @@ from enum import Enum
11
11
  from pathlib import Path
12
12
  from typing import Any, Optional, Union
13
13
 
14
- from pydantic import AnyUrl, BaseModel, PositiveInt
14
+ from pydantic import AnyUrl, BaseModel, Field, PositiveInt
15
15
  from tabulate import tabulate
16
16
  from typing_extensions import override
17
17
 
@@ -23,6 +23,7 @@ from docling_core.transforms.serializer.base import (
23
23
  BaseInlineSerializer,
24
24
  BaseKeyValueSerializer,
25
25
  BaseListSerializer,
26
+ BaseMetaSerializer,
26
27
  BasePictureSerializer,
27
28
  BaseTableSerializer,
28
29
  BaseTextSerializer,
@@ -36,10 +37,13 @@ from docling_core.transforms.serializer.common import (
36
37
  )
37
38
  from docling_core.types.doc.base import ImageRefMode
38
39
  from docling_core.types.doc.document import (
40
+ BaseMeta,
39
41
  CodeItem,
40
42
  ContentLayer,
41
43
  DescriptionAnnotation,
44
+ DescriptionMetaField,
42
45
  DocItem,
46
+ DocItemLabel,
43
47
  DoclingDocument,
44
48
  FloatingItem,
45
49
  Formatting,
@@ -51,14 +55,18 @@ from docling_core.types.doc.document import (
51
55
  KeyValueItem,
52
56
  ListGroup,
53
57
  ListItem,
58
+ MoleculeMetaField,
54
59
  NodeItem,
55
60
  PictureClassificationData,
61
+ PictureClassificationMetaField,
56
62
  PictureItem,
57
63
  PictureMoleculeData,
58
64
  PictureTabularChartData,
59
65
  RichTableCell,
60
66
  SectionHeaderItem,
67
+ SummaryMetaField,
61
68
  TableItem,
69
+ TabularChartMetaField,
62
70
  TextItem,
63
71
  TitleItem,
64
72
  )
@@ -101,8 +109,17 @@ class MarkdownParams(CommonParams):
101
109
  page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
102
110
  escape_underscores: bool = True
103
111
  escape_html: bool = True
104
- include_annotations: bool = True
105
- mark_annotations: bool = False
112
+ mark_meta: bool = Field(default=False, description="Mark meta sections.")
113
+ include_annotations: bool = Field(
114
+ default=True,
115
+ description="Include item annotations.",
116
+ deprecated="Use include_meta instead.",
117
+ )
118
+ mark_annotations: bool = Field(
119
+ default=False,
120
+ description="Mark annotation sections.",
121
+ deprecated="Use mark_meta instead.",
122
+ )
106
123
  orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
107
124
  ensure_valid_list_item_marker: bool = True
108
125
 
@@ -140,6 +157,10 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
140
157
  text = item.text
141
158
  processing_pending = True
142
159
 
160
+ if item.label == DocItemLabel.CHECKBOX_SELECTED:
161
+ text = f"- [x] {text}"
162
+ if item.label == DocItemLabel.CHECKBOX_UNSELECTED:
163
+ text = f"- [ ] {text}"
143
164
  if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
144
165
  if not has_inline_repr:
145
166
  # case where processing/formatting should be applied first (in inner scope)
@@ -240,9 +261,77 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
240
261
  return create_ser_result(text=text, span_source=res_parts)
241
262
 
242
263
 
264
+ class MarkdownMetaSerializer(BaseModel, BaseMetaSerializer):
265
+ """Markdown-specific meta serializer."""
266
+
267
+ @override
268
+ def serialize(
269
+ self,
270
+ *,
271
+ item: NodeItem,
272
+ doc: DoclingDocument,
273
+ **kwargs: Any,
274
+ ) -> SerializationResult:
275
+ """Serialize the item's meta."""
276
+ params = MarkdownParams(**kwargs)
277
+ return create_ser_result(
278
+ text="\n\n".join(
279
+ [
280
+ tmp
281
+ for key in (
282
+ list(item.meta.__class__.model_fields)
283
+ + list(item.meta.get_custom_part())
284
+ )
285
+ if (
286
+ (
287
+ params.allowed_meta_names is None
288
+ or key in params.allowed_meta_names
289
+ )
290
+ and (key not in params.blocked_meta_names)
291
+ and (
292
+ tmp := self._serialize_meta_field(
293
+ item.meta, key, params.mark_meta
294
+ )
295
+ )
296
+ )
297
+ ]
298
+ if item.meta
299
+ else []
300
+ ),
301
+ span_source=item if isinstance(item, DocItem) else [],
302
+ # NOTE for now using an empty span source for GroupItems
303
+ )
304
+
305
+ def _serialize_meta_field(
306
+ self, meta: BaseMeta, name: str, mark_meta: bool
307
+ ) -> Optional[str]:
308
+ if (field_val := getattr(meta, name)) is not None:
309
+ if isinstance(field_val, SummaryMetaField):
310
+ txt = field_val.text
311
+ elif isinstance(field_val, DescriptionMetaField):
312
+ txt = field_val.text
313
+ elif isinstance(field_val, PictureClassificationMetaField):
314
+ txt = self._humanize_text(field_val.get_main_prediction().class_name)
315
+ elif isinstance(field_val, MoleculeMetaField):
316
+ txt = field_val.smi
317
+ elif isinstance(field_val, TabularChartMetaField):
318
+ # suppressing tabular chart serialization
319
+ return None
320
+ elif tmp := str(field_val or ""):
321
+ txt = tmp
322
+ else:
323
+ return None
324
+ return (
325
+ f"[{self._humanize_text(name, title=True)}] {txt}" if mark_meta else txt
326
+ )
327
+ else:
328
+ return None
329
+
330
+
243
331
  class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
244
332
  """Markdown-specific annotation serializer."""
245
333
 
334
+ @override
246
335
  def serialize(
247
336
  self,
248
337
  *,
@@ -308,7 +397,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
308
397
 
309
398
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
310
399
 
311
- if params.include_annotations:
400
+ if params.use_legacy_annotations and params.include_annotations:
312
401
 
313
402
  ann_res = doc_serializer.serialize_annotations(
314
403
  item=item,
@@ -377,7 +466,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
377
466
  res_parts.append(cap_res)
378
467
 
379
468
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
380
- if params.include_annotations:
469
+ if params.use_legacy_annotations and params.include_annotations:
381
470
  ann_res = doc_serializer.serialize_annotations(
382
471
  item=item,
383
472
  **kwargs,
@@ -624,6 +713,7 @@ class MarkdownDocSerializer(DocSerializer):
624
713
  list_serializer: BaseListSerializer = MarkdownListSerializer()
625
714
  inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
626
715
 
716
+ meta_serializer: BaseMetaSerializer = MarkdownMetaSerializer()
627
717
  annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
628
718
 
629
719
  params: MarkdownParams = MarkdownParams()
@@ -722,3 +812,22 @@ class MarkdownDocSerializer(DocSerializer):
722
812
  def requires_page_break(self) -> bool:
723
813
  """Whether to add page breaks."""
724
814
  return self.params.page_break_placeholder is not None
815
+
816
+ @override
817
+ def serialize(
818
+ self,
819
+ *,
820
+ item: Optional[NodeItem] = None,
821
+ list_level: int = 0,
822
+ is_inline_scope: bool = False,
823
+ visited: Optional[set[str]] = None,
824
+ **kwargs: Any,
825
+ ) -> SerializationResult:
826
+ """Serialize a given node."""
827
+ return super().serialize(
828
+ item=item,
829
+ list_level=list_level,
830
+ is_inline_scope=is_inline_scope,
831
+ visited=visited,
832
+ **(dict(delim="\n\n") | kwargs),
833
+ )
@@ -9,6 +9,8 @@ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
10
  AnyTableCell,
11
11
  BaseAnnotation,
12
+ BaseMeta,
13
+ BasePrediction,
12
14
  ChartBar,
13
15
  ChartLine,
14
16
  ChartPoint,
@@ -17,12 +19,14 @@ from .document import (
17
19
  CodeItem,
18
20
  ContentLayer,
19
21
  DescriptionAnnotation,
22
+ DescriptionMetaField,
20
23
  DocItem,
21
24
  DoclingDocument,
22
25
  DocTagsDocument,
23
26
  DocTagsPage,
24
27
  DocumentOrigin,
25
28
  FloatingItem,
29
+ FloatingMeta,
26
30
  Formatting,
27
31
  FormItem,
28
32
  FormulaItem,
@@ -35,7 +39,10 @@ from .document import (
35
39
  KeyValueItem,
36
40
  ListGroup,
37
41
  ListItem,
42
+ MetaFieldName,
43
+ MetaUtils,
38
44
  MiscAnnotation,
45
+ MoleculeMetaField,
39
46
  NodeItem,
40
47
  OrderedList,
41
48
  PageItem,
@@ -43,9 +50,11 @@ from .document import (
43
50
  PictureChartData,
44
51
  PictureClassificationClass,
45
52
  PictureClassificationData,
53
+ PictureClassificationMetaField,
46
54
  PictureDataType,
47
55
  PictureItem,
48
56
  PictureLineChartData,
57
+ PictureMeta,
49
58
  PictureMoleculeData,
50
59
  PicturePieChartData,
51
60
  PictureScatterChartData,
@@ -56,9 +65,11 @@ from .document import (
56
65
  RichTableCell,
57
66
  Script,
58
67
  SectionHeaderItem,
68
+ SummaryMetaField,
59
69
  TableCell,
60
70
  TableData,
61
71
  TableItem,
72
+ TabularChartMetaField,
62
73
  TextItem,
63
74
  TitleItem,
64
75
  UnorderedList,