docling-core 2.21.2__tar.gz → 2.23.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (68)
  1. {docling_core-2.21.2 → docling_core-2.23.0}/PKG-INFO +1 -1
  2. docling_core-2.23.0/docling_core/experimental/__init__.py +6 -0
  3. docling_core-2.23.0/docling_core/experimental/serializer/__init__.py +6 -0
  4. docling_core-2.23.0/docling_core/experimental/serializer/base.py +227 -0
  5. docling_core-2.23.0/docling_core/experimental/serializer/common.py +353 -0
  6. docling_core-2.23.0/docling_core/experimental/serializer/markdown.py +461 -0
  7. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/doc/document.py +779 -330
  8. docling_core-2.23.0/docling_core/types/doc/page.py +1238 -0
  9. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/doc/tokens.py +1 -0
  10. {docling_core-2.21.2 → docling_core-2.23.0}/pyproject.toml +2 -1
  11. {docling_core-2.21.2 → docling_core-2.23.0}/LICENSE +0 -0
  12. {docling_core-2.21.2 → docling_core-2.23.0}/README.md +0 -0
  13. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/__init__.py +0 -0
  14. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/cli/__init__.py +0 -0
  15. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/cli/view.py +0 -0
  16. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/py.typed +0 -0
  17. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  18. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  19. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  20. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  21. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  22. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  23. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  24. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  25. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/search/__init__.py +0 -0
  26. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  27. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/search/mapping.py +0 -0
  28. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/search/meta.py +0 -0
  29. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/search/package.py +0 -0
  30. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/transforms/__init__.py +0 -0
  31. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/transforms/chunker/__init__.py +0 -0
  32. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/transforms/chunker/base.py +0 -0
  33. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  34. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  35. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/__init__.py +0 -0
  36. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/base.py +0 -0
  37. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/doc/__init__.py +0 -0
  38. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/doc/base.py +0 -0
  39. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/doc/labels.py +0 -0
  40. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/doc/utils.py +0 -0
  41. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/gen/__init__.py +0 -0
  42. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/gen/generic.py +0 -0
  43. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/io/__init__.py +0 -0
  44. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  45. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/legacy_doc/base.py +0 -0
  46. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  47. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  48. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  49. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/legacy_doc/document.py +0 -0
  50. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  51. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/nlp/__init__.py +0 -0
  52. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/nlp/qa.py +0 -0
  53. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/nlp/qa_labels.py +0 -0
  54. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/rec/__init__.py +0 -0
  55. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/rec/attribute.py +0 -0
  56. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/rec/base.py +0 -0
  57. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/rec/predicate.py +0 -0
  58. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/rec/record.py +0 -0
  59. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/rec/statement.py +0 -0
  60. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/types/rec/subject.py +0 -0
  61. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/utils/__init__.py +0 -0
  62. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/utils/alias.py +0 -0
  63. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/utils/file.py +0 -0
  64. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/utils/generate_docs.py +0 -0
  65. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/utils/generate_jsonschema.py +0 -0
  66. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/utils/legacy.py +0 -0
  67. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/utils/validate.py +0 -0
  68. {docling_core-2.21.2 → docling_core-2.23.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.21.2
3
+ Version: 2.23.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2025
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Experimental features."""
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the serializer types."""
@@ -0,0 +1,227 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2025
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define base classes for serialization."""
7
+ from abc import ABC, abstractmethod
8
+ from pathlib import Path
9
+ from typing import Optional, Union
10
+
11
+ from pydantic import AnyUrl, BaseModel
12
+
13
+ from docling_core.types.doc.document import (
14
+ DoclingDocument,
15
+ FloatingItem,
16
+ FormItem,
17
+ InlineGroup,
18
+ KeyValueItem,
19
+ NodeItem,
20
+ OrderedList,
21
+ PictureItem,
22
+ TableItem,
23
+ TextItem,
24
+ UnorderedList,
25
+ )
26
+
27
+
28
class SerializationResult(BaseModel):
    """SerializationResult."""

    # Pydantic model returned by every serializer in this module; it wraps the
    # text produced for an item (or for a whole document).
    text: str
32
+
33
+
34
class BaseTextSerializer(ABC):
    """Interface for serializers that handle text items."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: TextItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a single text item.

        Args:
            item: The text item to serialize.
            doc_serializer: The document-level serializer handed in by the caller.
            doc: The document passed along with the item.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result produced for the item.
        """
48
+
49
+
50
class BaseTableSerializer(ABC):
    """Interface for serializers that handle table items."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a single table item.

        Args:
            item: The table item to serialize.
            doc_serializer: The document-level serializer handed in by the caller.
            doc: The document passed along with the item.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result produced for the item.
        """
64
+
65
+
66
class BasePictureSerializer(ABC):
    """Interface for serializers that handle picture items."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a single picture item.

        Args:
            item: The picture item to serialize.
            doc_serializer: The document-level serializer handed in by the caller.
            doc: The document passed along with the item.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result produced for the item.
        """
80
+
81
+
82
class BaseKeyValueSerializer(ABC):
    """Interface for serializers that handle key-value items."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: KeyValueItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a single key-value item.

        Args:
            item: The key-value item to serialize.
            doc_serializer: The document-level serializer handed in by the caller.
            doc: The document passed along with the item.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result produced for the item.
        """
96
+
97
+
98
class BaseFormSerializer(ABC):
    """Interface for serializers that handle form items."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: FormItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a single form item.

        Args:
            item: The form item to serialize.
            doc_serializer: The document-level serializer handed in by the caller.
            doc: The document passed along with the item.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result produced for the item.
        """
112
+
113
+
114
class BaseListSerializer(ABC):
    """Interface for serializers that handle list groups."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: Union[UnorderedList, OrderedList],
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a single list group.

        Args:
            item: The unordered or ordered list group to serialize.
            doc_serializer: The document-level serializer handed in by the caller.
            doc: The document passed along with the item.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result produced for the list.
        """
128
+
129
+
130
class BaseInlineSerializer(ABC):
    """Interface for serializers that handle inline groups."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: InlineGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a single inline group.

        Args:
            item: The inline group to serialize.
            doc_serializer: The document-level serializer handed in by the caller.
            doc: The document passed along with the item.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result produced for the group.
        """
144
+
145
+
146
class BaseFallbackSerializer(ABC):
    """Interface for the catch-all serializer taking any node item."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a single node item.

        Args:
            item: The node item to serialize.
            doc_serializer: The document-level serializer handed in by the caller.
            doc: The document passed along with the item.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result produced for the item.
        """
160
+
161
+
162
class BaseDocSerializer(ABC):
    """Interface every document-level serializer has to implement."""

    @abstractmethod
    def serialize(self, **kwargs) -> SerializationResult:
        """Serialize the document and return the result."""

    @abstractmethod
    def serialize_bold(self, text: str, **kwargs) -> str:
        """Formatting hook: return the bold representation of `text`."""

    @abstractmethod
    def serialize_italic(self, text: str, **kwargs) -> str:
        """Formatting hook: return the italic representation of `text`."""

    @abstractmethod
    def serialize_underline(self, text: str, **kwargs) -> str:
        """Formatting hook: return the underlined representation of `text`."""

    @abstractmethod
    def serialize_strikethrough(self, text: str, **kwargs) -> str:
        """Formatting hook: return the struck-through representation of `text`."""

    @abstractmethod
    def serialize_hyperlink(
        self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
    ) -> str:
        """Hyperlink hook: return `text` linked to `hyperlink`.

        Args:
            text: The text the link is attached to.
            hyperlink: The link target, either a URL or a path.
            **kwargs: Additional, implementation-specific options.
        """

    @abstractmethod
    def get_parts(
        self,
        node: Optional[NodeItem] = None,
        **kwargs,
    ) -> list[SerializationResult]:
        """Collect the partial results that make up the serialization of a node.

        Args:
            node: The node to collect the parts for; optional.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The individual results, to be combined by the caller.
        """

    @abstractmethod
    def post_process(
        self,
        text: str,
        **kwargs,
    ) -> str:
        """Run post-processing steps on `text` and return the outcome."""

    @abstractmethod
    def serialize_captions(
        self,
        item: FloatingItem,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the captions attached to a floating item."""

    @abstractmethod
    def get_excluded_refs(self) -> list[str]:
        """Return the references of the items that are excluded."""
@@ -0,0 +1,353 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2025
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define base classes for serialization."""
7
+ import sys
8
+ from functools import cached_property
9
+ from pathlib import Path
10
+ from typing import Optional, Union
11
+
12
+ from pydantic import AnyUrl, BaseModel, computed_field
13
+ from typing_extensions import override
14
+
15
+ from docling_core.experimental.serializer.base import (
16
+ BaseDocSerializer,
17
+ BaseFallbackSerializer,
18
+ BaseFormSerializer,
19
+ BaseInlineSerializer,
20
+ BaseKeyValueSerializer,
21
+ BaseListSerializer,
22
+ BasePictureSerializer,
23
+ BaseTableSerializer,
24
+ BaseTextSerializer,
25
+ SerializationResult,
26
+ )
27
+ from docling_core.types.doc.base import ImageRefMode
28
+ from docling_core.types.doc.document import (
29
+ DEFAULT_CONTENT_LAYERS,
30
+ DOCUMENT_TOKENS_EXPORT_LABELS,
31
+ ContentLayer,
32
+ DocItem,
33
+ DoclingDocument,
34
+ FloatingItem,
35
+ Formatting,
36
+ FormItem,
37
+ InlineGroup,
38
+ KeyValueItem,
39
+ NodeItem,
40
+ OrderedList,
41
+ PictureClassificationData,
42
+ PictureDescriptionData,
43
+ PictureItem,
44
+ PictureMoleculeData,
45
+ TableItem,
46
+ TextItem,
47
+ UnorderedList,
48
+ )
49
+ from docling_core.types.doc.labels import DocItemLabel
50
+
51
# Labels serialized by default: an alias of the document module's export label
# set, used below as the default of `DocSerializer.labels`.
_DEFAULT_LABELS: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS
52
+
53
+
54
class DocSerializer(BaseModel, BaseDocSerializer):
    """Shared pydantic-based implementation of a document serializer.

    Holds the document, the item filtering criteria and one serializer per item
    kind. `get_parts` walks the document and dispatches every node to the
    matching serializer. The formatting and hyperlink hooks implemented here
    return the text unchanged.
    """

    # pydantic v2 style configuration (the nested `class Config` form is
    # deprecated in v2); arbitrary types are required since the serializer
    # fields below are plain ABCs and not pydantic models.
    model_config = {"arbitrary_types_allowed": True}

    doc: DoclingDocument

    include_formatting: bool = True
    include_hyperlinks: bool = True
    escape_underscores: bool = True

    # this filtering criteria are non-recursive;
    # e.g. if a list group node is outside the range and some of its children items are
    # within, they will be serialized
    start: int = 0
    stop: int = sys.maxsize
    labels: set[DocItemLabel] = _DEFAULT_LABELS
    layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS
    pages: Optional[set[int]] = None

    text_serializer: BaseTextSerializer
    table_serializer: BaseTableSerializer
    picture_serializer: BasePictureSerializer
    key_value_serializer: BaseKeyValueSerializer
    form_serializer: BaseFormSerializer
    fallback_serializer: BaseFallbackSerializer

    list_serializer: BaseListSerializer
    inline_serializer: BaseInlineSerializer

    # these will be passed to the picture serializer (None defers/delegates fallback
    # setting to callee):
    image_placeholder: Optional[str] = None
    image_mode: Optional[ImageRefMode] = None

    def _is_excluded(self, ix: int, item: NodeItem) -> bool:
        # Decide whether the item found at iteration index `ix` is filtered out.
        # The index range applies to every node; label, content layer and page
        # criteria only apply to DocItem instances.
        if ix < self.start or ix >= self.stop:
            return True
        if not isinstance(item, DocItem):
            return False
        if item.label not in self.labels or item.content_layer not in self.layers:
            return True
        if self.pages is None:
            return False
        # with a page filter set, items without provenance are excluded as well;
        # otherwise only the first provenance entry is checked
        return (not item.prov) or item.prov[0].page_no not in self.pages

    # NOTE: intentionally documented via comments only, since pydantic would pick
    # up a docstring as the description of the computed field.
    # Computed once per instance (cached_property): the self_refs of all nodes
    # (groups included, pictures traversed) that do not pass the filters.
    @computed_field  # type: ignore[misc]
    @cached_property
    def _excluded_refs(self) -> list[str]:
        return [
            item.self_ref
            for ix, (item, _) in enumerate(
                self.doc.iterate_items(
                    with_groups=True,
                    traverse_pictures=True,
                )
            )
            if self._is_excluded(ix=ix, item=item)
        ]

    @override
    def get_excluded_refs(self) -> list[str]:
        """References to excluded items."""
        return self._excluded_refs

    # making some assumptions about the kwargs it can pass
    @override
    def get_parts(
        self,
        node: Optional[NodeItem] = None,
        *,
        traverse_pictures: bool = False,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> list[SerializationResult]:
        """Get the components to be combined for serializing this node.

        Args:
            node: Root of the traversal, forwarded to `doc.iterate_items`.
            traverse_pictures: Forwarded to `doc.iterate_items`.
            list_level: Passed on to the list and inline serializers.
            is_inline_scope: Passed on to the list and text serializers.
            visited: Refs of already handled items; it is updated in place and
                shared with the list, inline and picture serializers so that no
                item gets serialized twice. A fresh set is used if omitted.
            **kwargs: Accepted for interface compatibility; not forwarded.

        Returns:
            The non-empty serialization results in traversal order.
        """
        my_visited: set[str] = visited if visited is not None else set()
        parts: list[SerializationResult] = []
        # built lazily on the first text item, so that membership tests are O(1)
        # instead of scanning the excluded refs list for every text item
        excluded_refs: Optional[set[str]] = None

        label_blocklist = {
            DocItemLabel.CAPTION,
            DocItemLabel.FOOTNOTE,
            # TODO handle differently as it clashes with self.labels
        }
        for item, _ in self.doc.iterate_items(
            root=node,
            with_groups=True,
            traverse_pictures=traverse_pictures,
        ):
            if item.self_ref in my_visited:
                continue
            my_visited.add(item.self_ref)

            ########
            # groups
            ########
            if isinstance(item, (UnorderedList, OrderedList)):
                part = self.list_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                    list_level=list_level,
                    is_inline_scope=is_inline_scope,
                    visited=my_visited,
                )
            elif isinstance(item, InlineGroup):
                part = self.inline_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                    list_level=list_level,
                    visited=my_visited,
                )
            ###########
            # doc items
            ###########
            elif isinstance(item, DocItem) and item.label in label_blocklist:
                # blocklisted items are marked as visited but not emitted here
                continue
            elif isinstance(item, TextItem):
                if excluded_refs is None:
                    excluded_refs = set(self.get_excluded_refs())
                if item.self_ref in excluded_refs:
                    # excluded text items contribute nothing
                    continue
                part = self.text_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                    is_inline_scope=is_inline_scope,
                )
            elif isinstance(item, TableItem):
                part = self.table_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                )
            elif isinstance(item, PictureItem):
                part = self.picture_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                    visited=my_visited,
                    image_mode=self.image_mode,
                    image_placeholder=self.image_placeholder,
                )
            elif isinstance(item, KeyValueItem):
                part = self.key_value_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                )
            elif isinstance(item, FormItem):
                part = self.form_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                )
            else:
                part = self.fallback_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                )
            # results with empty text are dropped
            if part.text:
                parts.append(part)
        return parts

    @override
    def post_process(
        self,
        text: str,
        *,
        formatting: Optional[Formatting] = None,
        hyperlink: Optional[Union[AnyUrl, Path]] = None,
        **kwargs,
    ) -> str:
        """Apply some text post-processing steps.

        Args:
            text: The text to process.
            formatting: Formatting flags; only applied if `include_formatting`
                is set.
            hyperlink: Link target; only applied if `include_hyperlinks` is set.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The text after running the applicable hooks, in the order bold,
            italic, underline, strikethrough, hyperlink.
        """
        res = text
        if self.include_formatting and formatting:
            if formatting.bold:
                res = self.serialize_bold(text=res)
            if formatting.italic:
                res = self.serialize_italic(text=res)
            if formatting.underline:
                res = self.serialize_underline(text=res)
            if formatting.strikethrough:
                res = self.serialize_strikethrough(text=res)
        if self.include_hyperlinks and hyperlink:
            res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
        return res

    @override
    def serialize_bold(self, text: str, **kwargs) -> str:
        """Hook for bold formatting serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_italic(self, text: str, **kwargs) -> str:
        """Hook for italic formatting serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_underline(self, text: str, **kwargs) -> str:
        """Hook for underline formatting serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_strikethrough(self, text: str, **kwargs) -> str:
        """Hook for strikethrough formatting serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_hyperlink(
        self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
    ) -> str:
        """Hook for hyperlink serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_captions(
        self,
        item: FloatingItem,
        separator: Optional[str] = None,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the item's captions.

        Args:
            item: The floating item whose captions get serialized.
            separator: String placed between the caption texts; a newline is
                used if None. An explicit empty string is honored.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The joined and post-processed text of all captions that resolve to a
            text item and are not excluded.
        """
        text_parts: list[str] = [
            it.text
            for cap in item.captions
            if isinstance(it := cap.resolve(self.doc), TextItem)
            and it.self_ref not in self.get_excluded_refs()
        ]
        # only fall back to the default when no separator was given at all
        delim = "\n" if separator is None else separator
        text_res = self.post_process(text=delim.join(text_parts))
        return SerializationResult(text=text_res)
303
+
304
+
305
class PictureSerializer(BasePictureSerializer):
    """Class for picture serializers.

    Provides helpers for serializing a picture's content and annotations; it
    does not implement `serialize` itself.
    """

    # helper function
    def _serialize_content(
        self,
        item: PictureItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        separator: Optional[str] = None,
        visited: Optional[set[str]] = None,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the parts found below the picture node.

        Args:
            item: The picture whose content gets serialized.
            doc_serializer: Serializer used for collecting the parts.
            doc: Not used by this helper.
            separator: String placed between the parts; a single space is used
                if None. An explicit empty string is honored.
            visited: Refs of already handled items, forwarded to `get_parts`.
            **kwargs: Not used by this helper.

        Returns:
            The joined text of the collected parts.
        """
        parts = doc_serializer.get_parts(
            node=item,
            traverse_pictures=True,
            visited=visited,
        )
        # only fall back to the default when no separator was given at all
        delim = " " if separator is None else separator
        text_res = delim.join([p.text for p in parts])
        # NOTE: we do no postprocessing since already done as needed
        return SerializationResult(text=text_res)

    # helper function
    def _serialize_annotations(
        self,
        item: PictureItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        separator: Optional[str] = None,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the supported annotations of the picture.

        Classification (first predicted class only), molecule and description
        annotations are turned into one line each; other kinds are skipped.

        Args:
            item: The picture whose annotations get serialized.
            doc_serializer: Serializer whose `post_process` is applied to the
                joined text.
            doc: Not used by this helper.
            separator: String placed between the lines; a newline is used if
                None. An explicit empty string is honored.
            **kwargs: Not used by this helper.

        Returns:
            The joined and post-processed annotation text.
        """
        text_parts: list[str] = []
        for annotation in item.annotations:
            if isinstance(annotation, PictureClassificationData):
                predicted_class = (
                    annotation.predicted_classes[0].class_name
                    if annotation.predicted_classes
                    else None
                )
                if predicted_class is not None:
                    text_parts.append(f"Picture type: {predicted_class}")
            elif isinstance(annotation, PictureMoleculeData):
                text_parts.append(f"SMILES: {annotation.smi}")
            elif isinstance(annotation, PictureDescriptionData):
                text_parts.append(f"Description: {annotation.text}")

        # only fall back to the default when no separator was given at all
        delim = "\n" if separator is None else separator
        text_res = doc_serializer.post_process(text=delim.join(text_parts))
        return SerializationResult(text=text_res)