docling-core 2.23.3__py3-none-any.whl → 2.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +2 -2
- docling_core/experimental/serializer/common.py +250 -196
- docling_core/experimental/serializer/doctags.py +492 -0
- docling_core/experimental/serializer/markdown.py +70 -41
- docling_core/types/doc/document.py +412 -418
- docling_core/types/doc/page.py +18 -6
- docling_core/types/doc/tokens.py +192 -26
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/METADATA +1 -1
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/RECORD +12 -11
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/LICENSE +0 -0
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/WHEEL +0 -0
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/entry_points.txt +0 -0
|
@@ -197,7 +197,7 @@ class BaseDocSerializer(ABC):
|
|
|
197
197
|
@abstractmethod
|
|
198
198
|
def get_parts(
|
|
199
199
|
self,
|
|
200
|
-
|
|
200
|
+
item: Optional[NodeItem] = None,
|
|
201
201
|
**kwargs,
|
|
202
202
|
) -> list[SerializationResult]:
|
|
203
203
|
"""Get the components to be combined for serializing this node."""
|
|
@@ -222,6 +222,6 @@ class BaseDocSerializer(ABC):
|
|
|
222
222
|
...
|
|
223
223
|
|
|
224
224
|
@abstractmethod
|
|
225
|
-
def get_excluded_refs(self) -> list[str]:
|
|
225
|
+
def get_excluded_refs(self, **kwargs) -> list[str]:
|
|
226
226
|
"""Get references to excluded items."""
|
|
227
227
|
...
|
|
@@ -5,12 +5,14 @@
|
|
|
5
5
|
|
|
6
6
|
"""Define base classes for serialization."""
|
|
7
7
|
import sys
|
|
8
|
+
from abc import abstractmethod
|
|
9
|
+
from copy import deepcopy
|
|
8
10
|
from functools import cached_property
|
|
9
11
|
from pathlib import Path
|
|
10
|
-
from typing import Optional, Union
|
|
12
|
+
from typing import Any, Optional, Union
|
|
11
13
|
|
|
12
|
-
from pydantic import AnyUrl, BaseModel, computed_field
|
|
13
|
-
from typing_extensions import override
|
|
14
|
+
from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
|
|
15
|
+
from typing_extensions import Self, override
|
|
14
16
|
|
|
15
17
|
from docling_core.experimental.serializer.base import (
|
|
16
18
|
BaseDocSerializer,
|
|
@@ -24,9 +26,7 @@ from docling_core.experimental.serializer.base import (
|
|
|
24
26
|
BaseTextSerializer,
|
|
25
27
|
SerializationResult,
|
|
26
28
|
)
|
|
27
|
-
from docling_core.types.doc.base import ImageRefMode
|
|
28
29
|
from docling_core.types.doc.document import (
|
|
29
|
-
DEFAULT_CONTENT_LAYERS,
|
|
30
30
|
DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
31
31
|
ContentLayer,
|
|
32
32
|
DocItem,
|
|
@@ -38,10 +38,7 @@ from docling_core.types.doc.document import (
|
|
|
38
38
|
KeyValueItem,
|
|
39
39
|
NodeItem,
|
|
40
40
|
OrderedList,
|
|
41
|
-
PictureClassificationData,
|
|
42
|
-
PictureDescriptionData,
|
|
43
41
|
PictureItem,
|
|
44
|
-
PictureMoleculeData,
|
|
45
42
|
TableItem,
|
|
46
43
|
TextItem,
|
|
47
44
|
UnorderedList,
|
|
@@ -49,6 +46,30 @@ from docling_core.types.doc.document import (
|
|
|
49
46
|
from docling_core.types.doc.labels import DocItemLabel
|
|
50
47
|
|
|
51
48
|
_DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
|
|
49
|
+
_DEFAULT_LAYERS = {cl for cl in ContentLayer}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class CommonParams(BaseModel):
|
|
53
|
+
"""Common serialization parameters."""
|
|
54
|
+
|
|
55
|
+
# allowlists with non-recursive semantics, i.e. if a list group node is outside the
|
|
56
|
+
# range and some of its children items are within, they will be serialized
|
|
57
|
+
labels: set[DocItemLabel] = _DEFAULT_LABELS
|
|
58
|
+
layers: set[ContentLayer] = _DEFAULT_LAYERS
|
|
59
|
+
pages: Optional[set[int]] = None # None means all pages are allowed
|
|
60
|
+
|
|
61
|
+
# slice-like semantics: start is included, stop is excluded
|
|
62
|
+
start_idx: NonNegativeInt = 0
|
|
63
|
+
stop_idx: NonNegativeInt = sys.maxsize
|
|
64
|
+
|
|
65
|
+
include_formatting: bool = True
|
|
66
|
+
include_hyperlinks: bool = True
|
|
67
|
+
caption_delim: str = " "
|
|
68
|
+
|
|
69
|
+
def merge_with_patch(self, patch: dict[str, Any]) -> Self:
|
|
70
|
+
"""Create an instance by merging the provided patch dict on top of self."""
|
|
71
|
+
res = self.model_validate({**self.model_dump(), **patch})
|
|
72
|
+
return res
|
|
52
73
|
|
|
53
74
|
|
|
54
75
|
class DocSerializer(BaseModel, BaseDocSerializer):
|
|
@@ -58,22 +79,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
58
79
|
"""Pydantic config."""
|
|
59
80
|
|
|
60
81
|
arbitrary_types_allowed = True
|
|
82
|
+
extra = "forbid"
|
|
61
83
|
|
|
62
84
|
doc: DoclingDocument
|
|
63
85
|
|
|
64
|
-
include_formatting: bool = True
|
|
65
|
-
include_hyperlinks: bool = True
|
|
66
|
-
escape_underscores: bool = True
|
|
67
|
-
|
|
68
|
-
# this filtering criteria are non-recursive;
|
|
69
|
-
# e.g. if a list group node is outside the range and some of its children items are
|
|
70
|
-
# within, they will be serialized
|
|
71
|
-
start: int = 0
|
|
72
|
-
stop: int = sys.maxsize
|
|
73
|
-
labels: set[DocItemLabel] = _DEFAULT_LABELS
|
|
74
|
-
layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS
|
|
75
|
-
pages: Optional[set[int]] = None
|
|
76
|
-
|
|
77
86
|
text_serializer: BaseTextSerializer
|
|
78
87
|
table_serializer: BaseTableSerializer
|
|
79
88
|
picture_serializer: BasePictureSerializer
|
|
@@ -84,150 +93,242 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
84
93
|
list_serializer: BaseListSerializer
|
|
85
94
|
inline_serializer: BaseInlineSerializer
|
|
86
95
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
image_mode: Optional[ImageRefMode] = None
|
|
96
|
+
params: CommonParams = CommonParams()
|
|
97
|
+
|
|
98
|
+
_excluded_refs_cache: dict[str, list[str]] = {}
|
|
91
99
|
|
|
92
100
|
@computed_field # type: ignore[misc]
|
|
93
101
|
@cached_property
|
|
94
|
-
def
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
+
def _captions_of_some_item(self) -> set[str]:
|
|
103
|
+
layers = {cl for cl in ContentLayer} # TODO review
|
|
104
|
+
refs = {
|
|
105
|
+
cap.cref
|
|
106
|
+
for (item, _) in self.doc.iterate_items(
|
|
107
|
+
with_groups=True,
|
|
108
|
+
traverse_pictures=True,
|
|
109
|
+
included_content_layers=layers,
|
|
102
110
|
)
|
|
103
|
-
if (
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
111
|
+
for cap in (item.captions if isinstance(item, FloatingItem) else [])
|
|
112
|
+
}
|
|
113
|
+
return refs
|
|
114
|
+
|
|
115
|
+
@override
|
|
116
|
+
def get_excluded_refs(self, **kwargs) -> list[str]:
|
|
117
|
+
"""References to excluded items."""
|
|
118
|
+
params = self.params.merge_with_patch(patch=kwargs)
|
|
119
|
+
params_json = params.model_dump_json()
|
|
120
|
+
refs = self._excluded_refs_cache.get(params_json)
|
|
121
|
+
if refs is None:
|
|
122
|
+
refs = [
|
|
123
|
+
item.self_ref
|
|
124
|
+
for ix, (item, _) in enumerate(
|
|
125
|
+
self.doc.iterate_items(
|
|
126
|
+
with_groups=True,
|
|
127
|
+
traverse_pictures=True,
|
|
128
|
+
included_content_layers=params.layers,
|
|
129
|
+
)
|
|
130
|
+
)
|
|
131
|
+
if (
|
|
132
|
+
(ix < params.start_idx or ix >= params.stop_idx)
|
|
133
|
+
or (
|
|
134
|
+
isinstance(item, DocItem)
|
|
135
|
+
and (
|
|
136
|
+
item.label not in params.labels
|
|
137
|
+
or item.content_layer not in params.layers
|
|
138
|
+
or (
|
|
139
|
+
params.pages is not None
|
|
140
|
+
and (
|
|
141
|
+
(not item.prov)
|
|
142
|
+
or item.prov[0].page_no not in params.pages
|
|
143
|
+
)
|
|
115
144
|
)
|
|
116
145
|
)
|
|
117
146
|
)
|
|
118
147
|
)
|
|
119
|
-
|
|
120
|
-
|
|
148
|
+
]
|
|
149
|
+
self._excluded_refs_cache[params_json] = refs
|
|
121
150
|
return refs
|
|
122
151
|
|
|
123
|
-
@
|
|
124
|
-
def
|
|
125
|
-
"""
|
|
126
|
-
|
|
152
|
+
@abstractmethod
|
|
153
|
+
def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
|
|
154
|
+
"""Serialize a page out of its parts."""
|
|
155
|
+
...
|
|
156
|
+
|
|
157
|
+
@abstractmethod
|
|
158
|
+
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
|
|
159
|
+
"""Serialize a document out of its pages."""
|
|
160
|
+
...
|
|
161
|
+
|
|
162
|
+
def _serialize_body(self) -> SerializationResult:
|
|
163
|
+
"""Serialize the document body."""
|
|
164
|
+
# find page ranges if available; otherwise regard whole doc as a single page
|
|
165
|
+
last_page: Optional[int] = None
|
|
166
|
+
starts: list[int] = []
|
|
167
|
+
for ix, (item, _) in enumerate(
|
|
168
|
+
self.doc.iterate_items(
|
|
169
|
+
with_groups=True,
|
|
170
|
+
traverse_pictures=True,
|
|
171
|
+
included_content_layers=self.params.layers,
|
|
172
|
+
)
|
|
173
|
+
):
|
|
174
|
+
if isinstance(item, DocItem):
|
|
175
|
+
if item.prov:
|
|
176
|
+
if last_page is None or item.prov[0].page_no > last_page:
|
|
177
|
+
starts.append(ix)
|
|
178
|
+
last_page = item.prov[0].page_no
|
|
179
|
+
page_ranges = [
|
|
180
|
+
(
|
|
181
|
+
(starts[i] if i > 0 else 0),
|
|
182
|
+
(starts[i + 1] if i < len(starts) - 1 else sys.maxsize),
|
|
183
|
+
)
|
|
184
|
+
for i, _ in enumerate(starts)
|
|
185
|
+
] or [
|
|
186
|
+
(0, sys.maxsize)
|
|
187
|
+
] # use whole range if no pages detected
|
|
188
|
+
|
|
189
|
+
page_results: list[SerializationResult] = []
|
|
190
|
+
for page_range in page_ranges:
|
|
191
|
+
params_to_pass = deepcopy(self.params)
|
|
192
|
+
params_to_pass.start_idx = page_range[0]
|
|
193
|
+
params_to_pass.stop_idx = page_range[1]
|
|
194
|
+
subparts = self.get_parts(**params_to_pass.model_dump())
|
|
195
|
+
page_res = self.serialize_page(subparts)
|
|
196
|
+
page_results.append(page_res)
|
|
197
|
+
res = self.serialize_doc(page_results)
|
|
198
|
+
return res
|
|
127
199
|
|
|
128
|
-
# making some assumptions about the kwargs it can pass
|
|
129
200
|
@override
|
|
130
|
-
def
|
|
201
|
+
def serialize(
|
|
131
202
|
self,
|
|
132
|
-
node: Optional[NodeItem] = None,
|
|
133
203
|
*,
|
|
134
|
-
|
|
204
|
+
item: Optional[NodeItem] = None,
|
|
135
205
|
list_level: int = 0,
|
|
136
206
|
is_inline_scope: bool = False,
|
|
137
207
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
138
208
|
**kwargs,
|
|
139
|
-
) ->
|
|
140
|
-
"""
|
|
209
|
+
) -> SerializationResult:
|
|
210
|
+
"""Serialize a given node."""
|
|
141
211
|
my_visited: set[str] = visited if visited is not None else set()
|
|
142
|
-
|
|
212
|
+
empty_res = SerializationResult(text="")
|
|
213
|
+
if item is None or item == self.doc.body:
|
|
214
|
+
if self.doc.body.self_ref not in my_visited:
|
|
215
|
+
my_visited.add(self.doc.body.self_ref)
|
|
216
|
+
return self._serialize_body()
|
|
217
|
+
else:
|
|
218
|
+
return empty_res
|
|
143
219
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
220
|
+
########
|
|
221
|
+
# groups
|
|
222
|
+
########
|
|
223
|
+
if isinstance(item, (UnorderedList, OrderedList)):
|
|
224
|
+
part = self.list_serializer.serialize(
|
|
225
|
+
item=item,
|
|
226
|
+
doc_serializer=self,
|
|
227
|
+
doc=self.doc,
|
|
228
|
+
list_level=list_level,
|
|
229
|
+
is_inline_scope=is_inline_scope,
|
|
230
|
+
visited=my_visited,
|
|
231
|
+
**kwargs,
|
|
155
232
|
)
|
|
156
|
-
):
|
|
157
|
-
|
|
158
|
-
|
|
233
|
+
elif isinstance(item, InlineGroup):
|
|
234
|
+
part = self.inline_serializer.serialize(
|
|
235
|
+
item=item,
|
|
236
|
+
doc_serializer=self,
|
|
237
|
+
doc=self.doc,
|
|
238
|
+
list_level=list_level,
|
|
239
|
+
visited=my_visited,
|
|
240
|
+
**kwargs,
|
|
241
|
+
)
|
|
242
|
+
###########
|
|
243
|
+
# doc items
|
|
244
|
+
###########
|
|
245
|
+
elif isinstance(item, TextItem):
|
|
246
|
+
if item.self_ref in self._captions_of_some_item:
|
|
247
|
+
# those captions will be handled by the floating item holding them
|
|
248
|
+
return empty_res
|
|
159
249
|
else:
|
|
160
|
-
my_visited.add(item.self_ref)
|
|
161
|
-
|
|
162
|
-
########
|
|
163
|
-
# groups
|
|
164
|
-
########
|
|
165
|
-
if isinstance(item, (UnorderedList, OrderedList)):
|
|
166
|
-
part = self.list_serializer.serialize(
|
|
167
|
-
item=item,
|
|
168
|
-
doc_serializer=self,
|
|
169
|
-
doc=self.doc,
|
|
170
|
-
list_level=list_level,
|
|
171
|
-
is_inline_scope=is_inline_scope,
|
|
172
|
-
visited=my_visited,
|
|
173
|
-
)
|
|
174
|
-
elif isinstance(item, InlineGroup):
|
|
175
|
-
part = self.inline_serializer.serialize(
|
|
176
|
-
item=item,
|
|
177
|
-
doc_serializer=self,
|
|
178
|
-
doc=self.doc,
|
|
179
|
-
list_level=list_level,
|
|
180
|
-
visited=my_visited,
|
|
181
|
-
)
|
|
182
|
-
###########
|
|
183
|
-
# doc items
|
|
184
|
-
###########
|
|
185
|
-
elif isinstance(item, DocItem) and item.label in label_blocklist:
|
|
186
|
-
continue
|
|
187
|
-
elif isinstance(item, TextItem):
|
|
188
250
|
part = (
|
|
189
251
|
self.text_serializer.serialize(
|
|
190
252
|
item=item,
|
|
191
253
|
doc_serializer=self,
|
|
192
254
|
doc=self.doc,
|
|
193
255
|
is_inline_scope=is_inline_scope,
|
|
256
|
+
**kwargs,
|
|
194
257
|
)
|
|
195
|
-
if item.self_ref not in self.get_excluded_refs()
|
|
196
|
-
else
|
|
197
|
-
)
|
|
198
|
-
elif isinstance(item, TableItem):
|
|
199
|
-
part = self.table_serializer.serialize(
|
|
200
|
-
item=item,
|
|
201
|
-
doc_serializer=self,
|
|
202
|
-
doc=self.doc,
|
|
203
|
-
)
|
|
204
|
-
elif isinstance(item, PictureItem):
|
|
205
|
-
part = self.picture_serializer.serialize(
|
|
206
|
-
item=item,
|
|
207
|
-
doc_serializer=self,
|
|
208
|
-
doc=self.doc,
|
|
209
|
-
visited=my_visited,
|
|
210
|
-
image_mode=self.image_mode,
|
|
211
|
-
image_placeholder=self.image_placeholder,
|
|
212
|
-
)
|
|
213
|
-
elif isinstance(item, KeyValueItem):
|
|
214
|
-
part = self.key_value_serializer.serialize(
|
|
215
|
-
item=item,
|
|
216
|
-
doc_serializer=self,
|
|
217
|
-
doc=self.doc,
|
|
218
|
-
)
|
|
219
|
-
elif isinstance(item, FormItem):
|
|
220
|
-
part = self.form_serializer.serialize(
|
|
221
|
-
item=item,
|
|
222
|
-
doc_serializer=self,
|
|
223
|
-
doc=self.doc,
|
|
258
|
+
if item.self_ref not in self.get_excluded_refs(**kwargs)
|
|
259
|
+
else empty_res
|
|
224
260
|
)
|
|
261
|
+
elif isinstance(item, TableItem):
|
|
262
|
+
part = self.table_serializer.serialize(
|
|
263
|
+
item=item,
|
|
264
|
+
doc_serializer=self,
|
|
265
|
+
doc=self.doc,
|
|
266
|
+
**kwargs,
|
|
267
|
+
)
|
|
268
|
+
elif isinstance(item, PictureItem):
|
|
269
|
+
part = self.picture_serializer.serialize(
|
|
270
|
+
item=item,
|
|
271
|
+
doc_serializer=self,
|
|
272
|
+
doc=self.doc,
|
|
273
|
+
visited=my_visited,
|
|
274
|
+
**kwargs,
|
|
275
|
+
)
|
|
276
|
+
elif isinstance(item, KeyValueItem):
|
|
277
|
+
part = self.key_value_serializer.serialize(
|
|
278
|
+
item=item,
|
|
279
|
+
doc_serializer=self,
|
|
280
|
+
doc=self.doc,
|
|
281
|
+
**kwargs,
|
|
282
|
+
)
|
|
283
|
+
elif isinstance(item, FormItem):
|
|
284
|
+
part = self.form_serializer.serialize(
|
|
285
|
+
item=item,
|
|
286
|
+
doc_serializer=self,
|
|
287
|
+
doc=self.doc,
|
|
288
|
+
**kwargs,
|
|
289
|
+
)
|
|
290
|
+
else:
|
|
291
|
+
part = self.fallback_serializer.serialize(
|
|
292
|
+
item=item,
|
|
293
|
+
doc_serializer=self,
|
|
294
|
+
doc=self.doc,
|
|
295
|
+
**kwargs,
|
|
296
|
+
)
|
|
297
|
+
return part
|
|
298
|
+
|
|
299
|
+
# making some assumptions about the kwargs it can pass
|
|
300
|
+
@override
|
|
301
|
+
def get_parts(
|
|
302
|
+
self,
|
|
303
|
+
item: Optional[NodeItem] = None,
|
|
304
|
+
*,
|
|
305
|
+
traverse_pictures: bool = False,
|
|
306
|
+
list_level: int = 0,
|
|
307
|
+
is_inline_scope: bool = False,
|
|
308
|
+
visited: Optional[set[str]] = None, # refs of visited items
|
|
309
|
+
**kwargs,
|
|
310
|
+
) -> list[SerializationResult]:
|
|
311
|
+
"""Get the components to be combined for serializing this node."""
|
|
312
|
+
parts: list[SerializationResult] = []
|
|
313
|
+
my_visited: set[str] = visited if visited is not None else set()
|
|
314
|
+
params = self.params.merge_with_patch(patch=kwargs)
|
|
315
|
+
for item, _ in self.doc.iterate_items(
|
|
316
|
+
root=item,
|
|
317
|
+
with_groups=True,
|
|
318
|
+
traverse_pictures=traverse_pictures,
|
|
319
|
+
included_content_layers=params.layers,
|
|
320
|
+
):
|
|
321
|
+
if item.self_ref in my_visited:
|
|
322
|
+
continue
|
|
225
323
|
else:
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
324
|
+
my_visited.add(item.self_ref)
|
|
325
|
+
part = self.serialize(
|
|
326
|
+
item=item,
|
|
327
|
+
list_level=list_level,
|
|
328
|
+
is_inline_scope=is_inline_scope,
|
|
329
|
+
visited=my_visited,
|
|
330
|
+
**kwargs,
|
|
331
|
+
)
|
|
231
332
|
if part.text:
|
|
232
333
|
parts.append(part)
|
|
233
334
|
return parts
|
|
@@ -242,8 +343,9 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
242
343
|
**kwargs,
|
|
243
344
|
) -> str:
|
|
244
345
|
"""Apply some text post-processing steps."""
|
|
346
|
+
params = self.params.merge_with_patch(patch=kwargs)
|
|
245
347
|
res = text
|
|
246
|
-
if
|
|
348
|
+
if params.include_formatting and formatting:
|
|
247
349
|
if formatting.bold:
|
|
248
350
|
res = self.serialize_bold(text=res)
|
|
249
351
|
if formatting.italic:
|
|
@@ -252,7 +354,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
252
354
|
res = self.serialize_underline(text=res)
|
|
253
355
|
if formatting.strikethrough:
|
|
254
356
|
res = self.serialize_strikethrough(text=res)
|
|
255
|
-
if
|
|
357
|
+
if params.include_hyperlinks and hyperlink:
|
|
256
358
|
res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
|
|
257
359
|
return res
|
|
258
360
|
|
|
@@ -287,67 +389,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
287
389
|
def serialize_captions(
|
|
288
390
|
self,
|
|
289
391
|
item: FloatingItem,
|
|
290
|
-
separator: Optional[str] = None,
|
|
291
392
|
**kwargs,
|
|
292
393
|
) -> SerializationResult:
|
|
293
394
|
"""Serialize the item's captions."""
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
"""Class for picture serializers."""
|
|
307
|
-
|
|
308
|
-
# helper function
|
|
309
|
-
def _serialize_content(
|
|
310
|
-
self,
|
|
311
|
-
item: PictureItem,
|
|
312
|
-
doc_serializer: "BaseDocSerializer",
|
|
313
|
-
doc: DoclingDocument,
|
|
314
|
-
separator: Optional[str] = None,
|
|
315
|
-
visited: Optional[set[str]] = None,
|
|
316
|
-
**kwargs,
|
|
317
|
-
) -> SerializationResult:
|
|
318
|
-
parts = doc_serializer.get_parts(
|
|
319
|
-
node=item,
|
|
320
|
-
traverse_pictures=True,
|
|
321
|
-
visited=visited,
|
|
322
|
-
)
|
|
323
|
-
text_res = (separator or " ").join([p.text for p in parts])
|
|
324
|
-
# NOTE: we do no postprocessing since already done as needed
|
|
325
|
-
return SerializationResult(text=text_res)
|
|
326
|
-
|
|
327
|
-
# helper function
|
|
328
|
-
def _serialize_annotations(
|
|
329
|
-
self,
|
|
330
|
-
item: PictureItem,
|
|
331
|
-
doc_serializer: "BaseDocSerializer",
|
|
332
|
-
doc: DoclingDocument,
|
|
333
|
-
separator: Optional[str] = None,
|
|
334
|
-
**kwargs,
|
|
335
|
-
) -> SerializationResult:
|
|
336
|
-
text_parts: list[str] = []
|
|
337
|
-
for annotation in item.annotations:
|
|
338
|
-
if isinstance(annotation, PictureClassificationData):
|
|
339
|
-
predicted_class = (
|
|
340
|
-
annotation.predicted_classes[0].class_name
|
|
341
|
-
if annotation.predicted_classes
|
|
342
|
-
else None
|
|
343
|
-
)
|
|
344
|
-
if predicted_class is not None:
|
|
345
|
-
text_parts.append(f"Picture type: {predicted_class}")
|
|
346
|
-
elif isinstance(annotation, PictureMoleculeData):
|
|
347
|
-
text_parts.append(f"SMILES: {annotation.smi}")
|
|
348
|
-
elif isinstance(annotation, PictureDescriptionData):
|
|
349
|
-
text_parts.append(f"Description: {annotation.text}")
|
|
350
|
-
|
|
351
|
-
text_res = (separator or "\n").join(text_parts)
|
|
352
|
-
text_res = doc_serializer.post_process(text=text_res)
|
|
395
|
+
params = self.params.merge_with_patch(patch=kwargs)
|
|
396
|
+
if DocItemLabel.CAPTION in params.labels:
|
|
397
|
+
text_parts: list[str] = [
|
|
398
|
+
it.text
|
|
399
|
+
for cap in item.captions
|
|
400
|
+
if isinstance(it := cap.resolve(self.doc), TextItem)
|
|
401
|
+
and it.self_ref not in self.get_excluded_refs(**kwargs)
|
|
402
|
+
]
|
|
403
|
+
text_res = params.caption_delim.join(text_parts)
|
|
404
|
+
text_res = self.post_process(text=text_res)
|
|
405
|
+
else:
|
|
406
|
+
text_res = ""
|
|
353
407
|
return SerializationResult(text=text_res)
|