docling-core 2.22.0__py3-none-any.whl → 2.23.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/__init__.py +6 -0
- docling_core/experimental/serializer/__init__.py +6 -0
- docling_core/experimental/serializer/base.py +227 -0
- docling_core/experimental/serializer/common.py +353 -0
- docling_core/experimental/serializer/markdown.py +461 -0
- docling_core/types/doc/document.py +348 -328
- docling_core/types/doc/page.py +1238 -0
- {docling_core-2.22.0.dist-info → docling_core-2.23.1.dist-info}/METADATA +5 -5
- {docling_core-2.22.0.dist-info → docling_core-2.23.1.dist-info}/RECORD +12 -6
- {docling_core-2.22.0.dist-info → docling_core-2.23.1.dist-info}/LICENSE +0 -0
- {docling_core-2.22.0.dist-info → docling_core-2.23.1.dist-info}/WHEEL +0 -0
- {docling_core-2.22.0.dist-info → docling_core-2.23.1.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2025
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define base classes for serialization."""
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional, Union
|
|
10
|
+
|
|
11
|
+
from pydantic import AnyUrl, BaseModel
|
|
12
|
+
|
|
13
|
+
from docling_core.types.doc.document import (
|
|
14
|
+
DoclingDocument,
|
|
15
|
+
FloatingItem,
|
|
16
|
+
FormItem,
|
|
17
|
+
InlineGroup,
|
|
18
|
+
KeyValueItem,
|
|
19
|
+
NodeItem,
|
|
20
|
+
OrderedList,
|
|
21
|
+
PictureItem,
|
|
22
|
+
TableItem,
|
|
23
|
+
TextItem,
|
|
24
|
+
UnorderedList,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SerializationResult(BaseModel):
    """SerializationResult."""

    # Text produced by a serializer; this is the only field of the result model.
    text: str
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class BaseTextSerializer(ABC):
    """Abstract interface for serializers that handle ``TextItem`` nodes."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: TextItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a text item.

        All arguments are keyword-only.

        Args:
            item: The text item to serialize.
            doc_serializer: The document serializer taking part in the run.
            doc: The document being serialized.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
        ...
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class BaseTableSerializer(ABC):
    """Abstract interface for serializers that handle ``TableItem`` nodes."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a table item.

        All arguments are keyword-only.

        Args:
            item: The table item to serialize.
            doc_serializer: The document serializer taking part in the run.
            doc: The document being serialized.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
        ...
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class BasePictureSerializer(ABC):
    """Abstract interface for serializers that handle ``PictureItem`` nodes."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a picture item.

        All arguments are keyword-only.

        Args:
            item: The picture item to serialize.
            doc_serializer: The document serializer taking part in the run.
            doc: The document being serialized.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
        ...
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class BaseKeyValueSerializer(ABC):
    """Abstract interface for serializers that handle ``KeyValueItem`` nodes."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: KeyValueItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a key-value item.

        All arguments are keyword-only.

        Args:
            item: The key-value item to serialize.
            doc_serializer: The document serializer taking part in the run.
            doc: The document being serialized.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
        ...
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class BaseFormSerializer(ABC):
    """Abstract interface for serializers that handle ``FormItem`` nodes."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: FormItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a form item.

        All arguments are keyword-only.

        Args:
            item: The form item to serialize.
            doc_serializer: The document serializer taking part in the run.
            doc: The document being serialized.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
        ...
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class BaseListSerializer(ABC):
    """Abstract interface for serializers that handle list group nodes."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: Union[UnorderedList, OrderedList],
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize an ordered or unordered list.

        All arguments are keyword-only.

        Args:
            item: The list group (``UnorderedList`` or ``OrderedList``) to
                serialize.
            doc_serializer: The document serializer taking part in the run.
            doc: The document being serialized.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
        ...
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class BaseInlineSerializer(ABC):
    """Abstract interface for serializers that handle ``InlineGroup`` nodes."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: InlineGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize an inline group.

        All arguments are keyword-only.

        Args:
            item: The inline group to serialize.
            doc_serializer: The document serializer taking part in the run.
            doc: The document being serialized.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
        ...
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class BaseFallbackSerializer(ABC):
    """Abstract interface for the fallback serializer of generic ``NodeItem``s."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a generic node item.

        All arguments are keyword-only.

        Args:
            item: The node item to serialize.
            doc_serializer: The document serializer taking part in the run.
            doc: The document being serialized.
            **kwargs: Additional implementation-specific options.

        Returns:
            The serialization result for ``item``.
        """
        ...
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class BaseDocSerializer(ABC):
    """Abstract interface for document-level serializers.

    Besides the main ``serialize`` entry point, it declares formatting hooks
    (bold, italic, underline, strikethrough, hyperlink) and helper methods
    (``get_parts``, ``post_process``, ``serialize_captions``,
    ``get_excluded_refs``) that concrete serializers must provide.
    """

    @abstractmethod
    def serialize(self, **kwargs) -> SerializationResult:
        """Run the serialization and return its result."""
        ...

    # ----------------
    # formatting hooks
    # ----------------

    @abstractmethod
    def serialize_bold(self, text: str, **kwargs) -> str:
        """Return ``text`` serialized as bold."""
        ...

    @abstractmethod
    def serialize_italic(self, text: str, **kwargs) -> str:
        """Return ``text`` serialized as italic."""
        ...

    @abstractmethod
    def serialize_underline(self, text: str, **kwargs) -> str:
        """Return ``text`` serialized as underlined."""
        ...

    @abstractmethod
    def serialize_strikethrough(self, text: str, **kwargs) -> str:
        """Return ``text`` serialized as struck through."""
        ...

    @abstractmethod
    def serialize_hyperlink(
        self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
    ) -> str:
        """Return ``text`` serialized as a link to ``hyperlink``."""
        ...

    # -------
    # helpers
    # -------

    @abstractmethod
    def get_parts(
        self,
        node: Optional[NodeItem] = None,
        **kwargs,
    ) -> list[SerializationResult]:
        """Return the partial results to be combined when serializing ``node``.

        Args:
            node: The node to collect parts for; defaults to ``None``.
            **kwargs: Additional implementation-specific options.
        """
        ...

    @abstractmethod
    def post_process(self, text: str, **kwargs) -> str:
        """Apply text post-processing steps to ``text`` and return the result."""
        ...

    @abstractmethod
    def serialize_captions(self, item: FloatingItem, **kwargs) -> SerializationResult:
        """Serialize the captions of the floating item ``item``."""
        ...

    @abstractmethod
    def get_excluded_refs(self) -> list[str]:
        """Return the references of the items excluded from serialization."""
        ...
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2025
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define base classes for serialization."""
|
|
7
|
+
import sys
from functools import cached_property
from pathlib import Path
from typing import Optional, Union

from pydantic import AnyUrl, BaseModel, ConfigDict, computed_field
from typing_extensions import override

from docling_core.experimental.serializer.base import (
    BaseDocSerializer,
    BaseFallbackSerializer,
    BaseFormSerializer,
    BaseInlineSerializer,
    BaseKeyValueSerializer,
    BaseListSerializer,
    BasePictureSerializer,
    BaseTableSerializer,
    BaseTextSerializer,
    SerializationResult,
)
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import (
    DEFAULT_CONTENT_LAYERS,
    DOCUMENT_TOKENS_EXPORT_LABELS,
    ContentLayer,
    DocItem,
    DoclingDocument,
    FloatingItem,
    Formatting,
    FormItem,
    InlineGroup,
    KeyValueItem,
    NodeItem,
    OrderedList,
    PictureClassificationData,
    PictureDescriptionData,
    PictureItem,
    PictureMoleculeData,
    TableItem,
    TextItem,
    UnorderedList,
)
from docling_core.types.doc.labels import DocItemLabel
|
|
50
|
+
|
|
51
|
+
# Default value of `DocSerializer.labels`: doc items whose label is not in this
# set are excluded from serialization unless the caller overrides `labels`.
_DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class DocSerializer(BaseModel, BaseDocSerializer):
    """Class for document serializers."""

    # Pydantic v2 configuration (replaces the deprecated nested `class Config`).
    # Arbitrary types must be allowed because the item serializer fields below
    # are plain ABCs rather than pydantic models.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # the document to serialize
    doc: DoclingDocument

    # switches consulted by `post_process` (formatting, hyperlinks);
    # `escape_underscores` is not read within this class
    include_formatting: bool = True
    include_hyperlinks: bool = True
    escape_underscores: bool = True

    # these filtering criteria are non-recursive;
    # e.g. if a list group node is outside the range and some of its children
    # items are within, they will be serialized
    start: int = 0
    stop: int = sys.maxsize
    labels: set[DocItemLabel] = _DEFAULT_LABELS
    layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS
    pages: Optional[set[int]] = None

    # per-item-type serializers that `get_parts` dispatches to
    text_serializer: BaseTextSerializer
    table_serializer: BaseTableSerializer
    picture_serializer: BasePictureSerializer
    key_value_serializer: BaseKeyValueSerializer
    form_serializer: BaseFormSerializer
    fallback_serializer: BaseFallbackSerializer

    # per-group-type serializers that `get_parts` dispatches to
    list_serializer: BaseListSerializer
    inline_serializer: BaseInlineSerializer

    # these will be passed to the picture serializer (None defers/delegates
    # fallback setting to callee):
    image_placeholder: Optional[str] = None
    image_mode: Optional[ImageRefMode] = None

    # Computed once per instance (cached_property). An item is excluded when
    #   - its position in the full traversal (groups included, pictures
    #     traversed) falls outside [start, stop), or
    #   - it is a DocItem and its label is not in `labels`, or its content
    #     layer is not in `layers`, or `pages` is set and the item either has
    #     no provenance or its first provenance entry is on a page not in
    #     `pages`.
    # NOTE: deliberately documented with comments rather than a docstring, so
    # that the computed field's metadata stays as it was.
    @computed_field  # type: ignore[misc]
    @cached_property
    def _excluded_refs(self) -> list[str]:
        refs: list[str] = [
            item.self_ref
            for ix, (item, _) in enumerate(
                self.doc.iterate_items(
                    with_groups=True,
                    traverse_pictures=True,
                )
            )
            if (
                (ix < self.start or ix >= self.stop)
                or (
                    isinstance(item, DocItem)
                    and (
                        item.label not in self.labels
                        or item.content_layer not in self.layers
                        or (
                            self.pages is not None
                            and (
                                (not item.prov)
                                or item.prov[0].page_no not in self.pages
                            )
                        )
                    )
                )
            )
        ]
        return refs

    @override
    def get_excluded_refs(self) -> list[str]:
        """References to excluded items."""
        return self._excluded_refs

    # making some assumptions about the kwargs it can pass
    @override
    def get_parts(
        self,
        node: Optional[NodeItem] = None,
        *,
        traverse_pictures: bool = False,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> list[SerializationResult]:
        """Get the components to be combined for serializing this node.

        Iterates over the items below ``node`` and dispatches each one to the
        serializer matching its type, collecting the non-empty results.

        Args:
            node: Root passed to ``doc.iterate_items``; defaults to ``None``.
            traverse_pictures: Forwarded to ``doc.iterate_items``.
            list_level: Forwarded to the list and inline serializers.
            is_inline_scope: Forwarded to the list and text serializers.
            visited: Refs of already visited items. The set is updated in
                place and handed on to the list, inline and picture
                serializers, so items reached through them are not emitted
                twice. A fresh set is created when ``None``.
            **kwargs: Accepted for interface compatibility; not forwarded.

        Returns:
            The serialization results with non-empty text, in iteration order.
        """
        my_visited: set[str] = visited if visited is not None else set()
        parts: list[SerializationResult] = []

        # captions and footnotes are skipped in the regular traversal
        label_blocklist = {
            DocItemLabel.CAPTION,
            DocItemLabel.FOOTNOTE,
            # TODO handle differently as it clashes with self.labels
        }
        for ix, (item, _) in enumerate(
            self.doc.iterate_items(
                root=node,
                with_groups=True,
                traverse_pictures=traverse_pictures,
                # ...
            )
        ):
            if item.self_ref in my_visited:
                continue
            else:
                my_visited.add(item.self_ref)

            ########
            # groups
            ########
            if isinstance(item, (UnorderedList, OrderedList)):
                part = self.list_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                    list_level=list_level,
                    is_inline_scope=is_inline_scope,
                    visited=my_visited,
                )
            elif isinstance(item, InlineGroup):
                part = self.inline_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                    list_level=list_level,
                    visited=my_visited,
                )
            ###########
            # doc items
            ###########
            elif isinstance(item, DocItem) and item.label in label_blocklist:
                continue
            elif isinstance(item, TextItem):
                # excluded text items yield an empty result, dropped below
                part = (
                    self.text_serializer.serialize(
                        item=item,
                        doc_serializer=self,
                        doc=self.doc,
                        is_inline_scope=is_inline_scope,
                    )
                    if item.self_ref not in self.get_excluded_refs()
                    else SerializationResult(text="")
                )
            elif isinstance(item, TableItem):
                part = self.table_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                )
            elif isinstance(item, PictureItem):
                part = self.picture_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                    visited=my_visited,
                    image_mode=self.image_mode,
                    image_placeholder=self.image_placeholder,
                )
            elif isinstance(item, KeyValueItem):
                part = self.key_value_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                )
            elif isinstance(item, FormItem):
                part = self.form_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                )
            else:
                part = self.fallback_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                )
            # only keep results that actually carry text
            if part.text:
                parts.append(part)
        return parts

    @override
    def post_process(
        self,
        text: str,
        *,
        formatting: Optional[Formatting] = None,
        hyperlink: Optional[Union[AnyUrl, Path]] = None,
        **kwargs,
    ) -> str:
        """Apply some text post-processing steps.

        Args:
            text: The text to post-process.
            formatting: Formatting flags; applied through the bold, italic,
                underline and strikethrough hooks (in that order) when
                ``include_formatting`` is set.
            hyperlink: Link target; applied through the hyperlink hook, after
                formatting, when ``include_hyperlinks`` is set.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The post-processed text.
        """
        res = text
        if self.include_formatting and formatting:
            if formatting.bold:
                res = self.serialize_bold(text=res)
            if formatting.italic:
                res = self.serialize_italic(text=res)
            if formatting.underline:
                res = self.serialize_underline(text=res)
            if formatting.strikethrough:
                res = self.serialize_strikethrough(text=res)
        if self.include_hyperlinks and hyperlink:
            res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
        return res

    @override
    def serialize_bold(self, text: str, **kwargs) -> str:
        """Hook for bold formatting serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_italic(self, text: str, **kwargs) -> str:
        """Hook for italic formatting serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_underline(self, text: str, **kwargs) -> str:
        """Hook for underline formatting serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_strikethrough(self, text: str, **kwargs) -> str:
        """Hook for strikethrough serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_hyperlink(
        self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
    ) -> str:
        """Hook for hyperlink serialization; returns `text` unchanged."""
        return text

    @override
    def serialize_captions(
        self,
        item: FloatingItem,
        separator: Optional[str] = None,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the item's captions.

        Args:
            item: The floating item whose captions are serialized.
            separator: String placed between caption texts; a newline is used
                when it is ``None`` or empty.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The joined and post-processed text of all captions that resolve to
            a ``TextItem`` and are not excluded.
        """
        text_parts: list[str] = [
            it.text
            for cap in item.captions
            if isinstance(it := cap.resolve(self.doc), TextItem)
            and it.self_ref not in self.get_excluded_refs()
        ]
        text_res = (separator or "\n").join(text_parts)
        text_res = self.post_process(text=text_res)
        return SerializationResult(text=text_res)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class PictureSerializer(BasePictureSerializer):
    """Picture serializer base offering shared content/annotation helpers."""

    def _serialize_content(
        self,
        item: PictureItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        separator: Optional[str] = None,
        visited: Optional[set[str]] = None,
        **kwargs,
    ) -> SerializationResult:
        """Serialize what is nested inside the picture.

        The parts are obtained from ``doc_serializer.get_parts`` (with picture
        traversal enabled) and joined with ``separator``, or a single space
        when no separator is given.
        """
        nested_parts = doc_serializer.get_parts(
            node=item,
            traverse_pictures=True,
            visited=visited,
        )
        delimiter = separator or " "
        # NOTE: no post-processing here, the parts already went through it
        return SerializationResult(
            text=delimiter.join(part.text for part in nested_parts)
        )

    def _serialize_annotations(
        self,
        item: PictureItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        separator: Optional[str] = None,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the picture's annotations, one entry per annotation.

        Classification, molecule and description annotations are rendered;
        any other annotation type is skipped. Entries are joined with
        ``separator`` (newline when not given) and then post-processed by the
        document serializer.
        """
        entries: list[str] = []
        for ann in item.annotations:
            if isinstance(ann, PictureClassificationData):
                # only the first predicted class is reported, if there is one
                if ann.predicted_classes:
                    top_class = ann.predicted_classes[0].class_name
                    if top_class is not None:
                        entries.append(f"Picture type: {top_class}")
            elif isinstance(ann, PictureMoleculeData):
                entries.append(f"SMILES: {ann.smi}")
            elif isinstance(ann, PictureDescriptionData):
                entries.append(f"Description: {ann.text}")

        joined = (separator or "\n").join(entries)
        return SerializationResult(text=doc_serializer.post_process(text=joined))
|