docling-core 2.25.0__py3-none-any.whl → 2.26.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -11,6 +11,7 @@ from typing import Optional, Union
11
11
  from pydantic import AnyUrl, BaseModel
12
12
 
13
13
  from docling_core.types.doc.document import (
14
+ DocItem,
14
15
  DoclingDocument,
15
16
  FloatingItem,
16
17
  FormItem,
@@ -25,10 +26,19 @@ from docling_core.types.doc.document import (
25
26
  )
26
27
 
27
28
 
29
+ class Span(BaseModel):
30
+ """Class encapsulating fine-granular document span information."""
31
+
32
+ item: DocItem
33
+ # prov_idx: Optional[PositiveInt] = None # None to be interpreted as whole DocItem
34
+
35
+
28
36
  class SerializationResult(BaseModel):
29
37
  """SerializationResult."""
30
38
 
31
- text: str
39
+ text: str = ""
40
+ spans: list[Span] = []
41
+ # group: Optional[GroupItem] = None # set when result reflects specific group item
32
42
 
33
43
 
34
44
  class BaseTextSerializer(ABC):
@@ -163,7 +173,9 @@ class BaseDocSerializer(ABC):
163
173
  """Base class for document serializers."""
164
174
 
165
175
  @abstractmethod
166
- def serialize(self, **kwargs) -> SerializationResult:
176
+ def serialize(
177
+ self, *, item: Optional[NodeItem] = None, **kwargs
178
+ ) -> SerializationResult:
167
179
  """Run the serialization."""
168
180
  ...
169
181
 
@@ -225,3 +237,12 @@ class BaseDocSerializer(ABC):
225
237
  def get_excluded_refs(self, **kwargs) -> list[str]:
226
238
  """Get references to excluded items."""
227
239
  ...
240
+
241
+
242
+ class BaseSerializerProvider(ABC):
243
+ """Base class for document serializer providers."""
244
+
245
+ @abstractmethod
246
+ def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
247
+ """Get the associated serializer."""
248
+ ...
@@ -25,6 +25,7 @@ from docling_core.experimental.serializer.base import (
25
25
  BaseTableSerializer,
26
26
  BaseTextSerializer,
27
27
  SerializationResult,
28
+ Span,
28
29
  )
29
30
  from docling_core.types.doc.document import (
30
31
  DOCUMENT_TOKENS_EXPORT_LABELS,
@@ -49,6 +50,38 @@ _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
49
50
  _DEFAULT_LAYERS = {cl for cl in ContentLayer}
50
51
 
51
52
 
53
+ def create_ser_result(
54
+ *,
55
+ text: str = "",
56
+ span_source: Union[DocItem, list[SerializationResult]] = [],
57
+ ) -> SerializationResult:
58
+ """Function for creating `SerializationResult` instances.
59
+
60
+ Args:
61
+ text: the text to use. Defaults to "".
62
+ span_source: the item or list of results to use as span source. Defaults to [].
63
+
64
+ Returns:
65
+ The created `SerializationResult`.
66
+ """
67
+ spans: list[Span]
68
+ if isinstance(span_source, DocItem):
69
+ spans = [Span(item=span_source)]
70
+ else:
71
+ results: list[SerializationResult] = span_source
72
+ spans = []
73
+ span_ids: set[str] = set()
74
+ for ser_res in results:
75
+ for span in ser_res.spans:
76
+ if (span_id := span.item.self_ref) not in span_ids:
77
+ span_ids.add(span_id)
78
+ spans.append(span)
79
+ return SerializationResult(
80
+ text=text,
81
+ spans=spans,
82
+ )
83
+
84
+
52
85
  class CommonParams(BaseModel):
53
86
  """Common serialization parameters."""
54
87
 
@@ -150,20 +183,26 @@ class DocSerializer(BaseModel, BaseDocSerializer):
150
183
  return refs
151
184
 
152
185
  @abstractmethod
153
- def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
186
+ def serialize_page(
187
+ self, *, parts: list[SerializationResult], **kwargs
188
+ ) -> SerializationResult:
154
189
  """Serialize a page out of its parts."""
155
190
  ...
156
191
 
157
192
  @abstractmethod
158
- def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
193
+ def serialize_doc(
194
+ self, *, pages: dict[Optional[int], SerializationResult], **kwargs
195
+ ) -> SerializationResult:
159
196
  """Serialize a document out of its pages."""
160
197
  ...
161
198
 
162
199
  def _serialize_body(self) -> SerializationResult:
163
200
  """Serialize the document body."""
164
201
  # find page ranges if available; otherwise regard whole doc as a single page
165
- last_page: Optional[int] = None
166
- starts: list[int] = []
202
+ prev_start: int = 0
203
+ prev_page_nr: Optional[int] = None
204
+ range_by_page_nr: dict[Optional[int], tuple[int, int]] = {}
205
+
167
206
  for ix, (item, _) in enumerate(
168
207
  self.doc.iterate_items(
169
208
  with_groups=True,
@@ -173,28 +212,30 @@ class DocSerializer(BaseModel, BaseDocSerializer):
173
212
  ):
174
213
  if isinstance(item, DocItem):
175
214
  if item.prov:
176
- if last_page is None or item.prov[0].page_no > last_page:
177
- starts.append(ix)
178
- last_page = item.prov[0].page_no
179
- page_ranges = [
180
- (
181
- (starts[i] if i > 0 else 0),
182
- (starts[i + 1] if i < len(starts) - 1 else sys.maxsize),
183
- )
184
- for i, _ in enumerate(starts)
185
- ] or [
186
- (0, sys.maxsize)
187
- ] # use whole range if no pages detected
215
+ page_no = item.prov[0].page_no
216
+ if prev_page_nr is None or page_no > prev_page_nr:
217
+ if prev_page_nr is not None: # close previous range
218
+ range_by_page_nr[prev_page_nr] = (prev_start, ix)
219
+
220
+ prev_start = ix
221
+ # could alternatively always start 1st page from 0:
222
+ # prev_start = ix if prev_page_nr is not None else 0
223
+
224
+ prev_page_nr = page_no
225
+
226
+ # close last (and single if no pages) range
227
+ range_by_page_nr[prev_page_nr] = (prev_start, sys.maxsize)
188
228
 
189
- page_results: list[SerializationResult] = []
190
- for page_range in page_ranges:
229
+ page_results: dict[Optional[int], SerializationResult] = {}
230
+ for page_nr in range_by_page_nr:
231
+ page_range = range_by_page_nr[page_nr]
191
232
  params_to_pass = deepcopy(self.params)
192
233
  params_to_pass.start_idx = page_range[0]
193
234
  params_to_pass.stop_idx = page_range[1]
194
235
  subparts = self.get_parts(**params_to_pass.model_dump())
195
- page_res = self.serialize_page(subparts)
196
- page_results.append(page_res)
197
- res = self.serialize_doc(page_results)
236
+ page_res = self.serialize_page(parts=subparts)
237
+ page_results[page_nr] = page_res
238
+ res = self.serialize_doc(pages=page_results)
198
239
  return res
199
240
 
200
241
  @override
@@ -209,7 +250,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
209
250
  ) -> SerializationResult:
210
251
  """Serialize a given node."""
211
252
  my_visited: set[str] = visited if visited is not None else set()
212
- empty_res = SerializationResult(text="")
253
+ my_kwargs = self.params.merge_with_patch(patch=kwargs).model_dump()
254
+ empty_res = create_ser_result()
213
255
  if item is None or item == self.doc.body:
214
256
  if self.doc.body.self_ref not in my_visited:
215
257
  my_visited.add(self.doc.body.self_ref)
@@ -217,6 +259,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
217
259
  else:
218
260
  return empty_res
219
261
 
262
+ my_visited.add(item.self_ref)
263
+
220
264
  ########
221
265
  # groups
222
266
  ########
@@ -228,7 +272,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
228
272
  list_level=list_level,
229
273
  is_inline_scope=is_inline_scope,
230
274
  visited=my_visited,
231
- **kwargs,
275
+ **my_kwargs,
232
276
  )
233
277
  elif isinstance(item, InlineGroup):
234
278
  part = self.inline_serializer.serialize(
@@ -237,7 +281,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
237
281
  doc=self.doc,
238
282
  list_level=list_level,
239
283
  visited=my_visited,
240
- **kwargs,
284
+ **my_kwargs,
241
285
  )
242
286
  ###########
243
287
  # doc items
@@ -253,7 +297,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
253
297
  doc_serializer=self,
254
298
  doc=self.doc,
255
299
  is_inline_scope=is_inline_scope,
256
- **kwargs,
300
+ **my_kwargs,
257
301
  )
258
302
  if item.self_ref not in self.get_excluded_refs(**kwargs)
259
303
  else empty_res
@@ -263,7 +307,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
263
307
  item=item,
264
308
  doc_serializer=self,
265
309
  doc=self.doc,
266
- **kwargs,
310
+ **my_kwargs,
267
311
  )
268
312
  elif isinstance(item, PictureItem):
269
313
  part = self.picture_serializer.serialize(
@@ -271,28 +315,28 @@ class DocSerializer(BaseModel, BaseDocSerializer):
271
315
  doc_serializer=self,
272
316
  doc=self.doc,
273
317
  visited=my_visited,
274
- **kwargs,
318
+ **my_kwargs,
275
319
  )
276
320
  elif isinstance(item, KeyValueItem):
277
321
  part = self.key_value_serializer.serialize(
278
322
  item=item,
279
323
  doc_serializer=self,
280
324
  doc=self.doc,
281
- **kwargs,
325
+ **my_kwargs,
282
326
  )
283
327
  elif isinstance(item, FormItem):
284
328
  part = self.form_serializer.serialize(
285
329
  item=item,
286
330
  doc_serializer=self,
287
331
  doc=self.doc,
288
- **kwargs,
332
+ **my_kwargs,
289
333
  )
290
334
  else:
291
335
  part = self.fallback_serializer.serialize(
292
336
  item=item,
293
337
  doc_serializer=self,
294
338
  doc=self.doc,
295
- **kwargs,
339
+ **my_kwargs,
296
340
  )
297
341
  return part
298
342
 
@@ -393,15 +437,16 @@ class DocSerializer(BaseModel, BaseDocSerializer):
393
437
  ) -> SerializationResult:
394
438
  """Serialize the item's captions."""
395
439
  params = self.params.merge_with_patch(patch=kwargs)
440
+ results: list[SerializationResult] = []
396
441
  if DocItemLabel.CAPTION in params.labels:
397
- text_parts: list[str] = [
398
- it.text
442
+ results = [
443
+ create_ser_result(text=it.text, span_source=it)
399
444
  for cap in item.captions
400
445
  if isinstance(it := cap.resolve(self.doc), TextItem)
401
446
  and it.self_ref not in self.get_excluded_refs(**kwargs)
402
447
  ]
403
- text_res = params.caption_delim.join(text_parts)
448
+ text_res = params.caption_delim.join([r.text for r in results])
404
449
  text_res = self.post_process(text=text_res)
405
450
  else:
406
451
  text_res = ""
407
- return SerializationResult(text=text_res)
452
+ return create_ser_result(text=text_res, span_source=results)
@@ -18,7 +18,11 @@ from docling_core.experimental.serializer.base import (
18
18
  BaseTextSerializer,
19
19
  SerializationResult,
20
20
  )
21
- from docling_core.experimental.serializer.common import CommonParams, DocSerializer
21
+ from docling_core.experimental.serializer.common import (
22
+ CommonParams,
23
+ DocSerializer,
24
+ create_ser_result,
25
+ )
22
26
  from docling_core.types.doc.document import (
23
27
  CodeItem,
24
28
  DocItem,
@@ -33,10 +37,12 @@ from docling_core.types.doc.document import (
33
37
  PictureClassificationData,
34
38
  PictureItem,
35
39
  PictureMoleculeData,
40
+ PictureTabularChartData,
36
41
  TableItem,
37
42
  TextItem,
38
43
  UnorderedList,
39
44
  )
45
+ from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
40
46
  from docling_core.types.doc.tokens import DocumentToken
41
47
 
42
48
 
@@ -135,7 +141,7 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
135
141
  text_res = "".join(parts)
136
142
  if wrap_tag is not None:
137
143
  text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
138
- return SerializationResult(text=text_res)
144
+ return create_ser_result(text=text_res, span_source=item)
139
145
 
140
146
 
141
147
  class DocTagsTableSerializer(BaseTableSerializer):
@@ -153,7 +159,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
153
159
  """Serializes the passed item."""
154
160
  params = DocTagsParams(**kwargs)
155
161
 
156
- parts: list[str] = []
162
+ res_parts: list[SerializationResult] = []
157
163
 
158
164
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
159
165
  if params.add_location:
@@ -162,7 +168,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
162
168
  xsize=params.xsize,
163
169
  ysize=params.ysize,
164
170
  )
165
- parts.append(loc_text)
171
+ res_parts.append(create_ser_result(text=loc_text, span_source=item))
166
172
 
167
173
  otsl_text = item.export_to_otsl(
168
174
  doc=doc,
@@ -171,18 +177,18 @@ class DocTagsTableSerializer(BaseTableSerializer):
171
177
  xsize=params.xsize,
172
178
  ysize=params.ysize,
173
179
  )
174
- parts.append(otsl_text)
180
+ res_parts.append(create_ser_result(text=otsl_text, span_source=item))
175
181
 
176
182
  if params.add_caption:
177
- cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
178
- if cap_text:
179
- parts.append(cap_text)
183
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
184
+ if cap_res.text:
185
+ res_parts.append(cap_res)
180
186
 
181
- text_res = "".join(parts)
187
+ text_res = "".join([r.text for r in res_parts])
182
188
  if text_res:
183
189
  text_res = _wrap(text=text_res, wrap_tag=DocumentToken.OTSL.value)
184
190
 
185
- return SerializationResult(text=text_res)
191
+ return create_ser_result(text=text_res, span_source=res_parts)
186
192
 
187
193
 
188
194
  class DocTagsPictureSerializer(BasePictureSerializer):
@@ -199,7 +205,8 @@ class DocTagsPictureSerializer(BasePictureSerializer):
199
205
  ) -> SerializationResult:
200
206
  """Serializes the passed item."""
201
207
  params = DocTagsParams(**kwargs)
202
- parts: list[str] = []
208
+ res_parts: list[SerializationResult] = []
209
+ is_chart = False
203
210
 
204
211
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
205
212
  body = ""
@@ -217,6 +224,16 @@ class DocTagsPictureSerializer(BasePictureSerializer):
217
224
  ]
218
225
  if len(classifications) > 0:
219
226
  predicted_class = classifications[0].predicted_classes[0].class_name
227
+ if predicted_class in [
228
+ PictureClassificationLabel.PIE_CHART,
229
+ PictureClassificationLabel.BAR_CHART,
230
+ PictureClassificationLabel.STACKED_BAR_CHART,
231
+ PictureClassificationLabel.LINE_CHART,
232
+ PictureClassificationLabel.FLOW_CHART,
233
+ PictureClassificationLabel.SCATTER_CHART,
234
+ PictureClassificationLabel.HEATMAP,
235
+ ]:
236
+ is_chart = True
220
237
  body += DocumentToken.get_picture_classification_token(predicted_class)
221
238
 
222
239
  smiles_annotations = [
@@ -226,20 +243,35 @@ class DocTagsPictureSerializer(BasePictureSerializer):
226
243
  body += _wrap(
227
244
  text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value
228
245
  )
229
- parts.append(body)
246
+
247
+ tabular_chart_annotations = [
248
+ ann
249
+ for ann in item.annotations
250
+ if isinstance(ann, PictureTabularChartData)
251
+ ]
252
+ if len(tabular_chart_annotations) > 0:
253
+ temp_doc = DoclingDocument(name="temp")
254
+ temp_table = temp_doc.add_table(
255
+ data=tabular_chart_annotations[0].chart_data
256
+ )
257
+ otsl_content = temp_table.export_to_otsl(
258
+ temp_doc, add_cell_location=False
259
+ )
260
+ body += otsl_content
261
+ res_parts.append(create_ser_result(text=body, span_source=item))
230
262
 
231
263
  if params.add_caption:
232
- cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
233
- if cap_text:
234
- parts.append(cap_text)
264
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
265
+ if cap_res.text:
266
+ res_parts.append(cap_res)
235
267
 
236
- text_res = "".join(parts)
268
+ text_res = "".join([r.text for r in res_parts])
237
269
  if text_res:
238
270
  token = DocumentToken.create_token_name_from_doc_item_label(
239
- label=item.label
271
+ label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
240
272
  )
241
273
  text_res = _wrap(text=text_res, wrap_tag=token)
242
- return SerializationResult(text=text_res)
274
+ return create_ser_result(text=text_res, span_source=res_parts)
243
275
 
244
276
 
245
277
  class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
@@ -256,8 +288,8 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
256
288
  ) -> SerializationResult:
257
289
  """Serializes the passed item."""
258
290
  params = DocTagsParams(**kwargs)
259
-
260
291
  body = ""
292
+ results: list[SerializationResult] = []
261
293
 
262
294
  page_no = 1
263
295
  if len(item.prov) > 0:
@@ -302,14 +334,16 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
302
334
  tok = f"{cell.label.value}_{cell.cell_id}"
303
335
  cell_txt = _wrap(text=cell_txt, wrap_tag=tok)
304
336
  body += cell_txt
337
+ results.append(create_ser_result(text=body, span_source=item))
305
338
 
306
339
  if params.add_caption:
307
- cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
308
- if cap_text:
309
- body += cap_text
340
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
341
+ if cap_res.text:
342
+ results.append(cap_res)
310
343
 
344
+ body = "".join([r.text for r in results])
311
345
  body = _wrap(body, DocumentToken.KEY_VALUE_REGION.value)
312
- return SerializationResult(text=body)
346
+ return create_ser_result(text=body, span_source=results)
313
347
 
314
348
 
315
349
  class DocTagsFormSerializer(BaseFormSerializer):
@@ -326,8 +360,7 @@ class DocTagsFormSerializer(BaseFormSerializer):
326
360
  ) -> SerializationResult:
327
361
  """Serializes the passed item."""
328
362
  # TODO add actual implementation
329
- text_res = ""
330
- return SerializationResult(text=text_res)
363
+ return create_ser_result()
331
364
 
332
365
 
333
366
  class DocTagsListSerializer(BaseModel, BaseListSerializer):
@@ -348,7 +381,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
348
381
  **kwargs,
349
382
  ) -> SerializationResult:
350
383
  """Serializes the passed item."""
351
- my_visited = visited or set()
384
+ my_visited = visited if visited is not None else set()
352
385
  params = DocTagsParams(**kwargs)
353
386
  parts = doc_serializer.get_parts(
354
387
  item=item,
@@ -361,8 +394,9 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
361
394
  if parts:
362
395
  text_res = delim.join(
363
396
  [
364
- _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value)
397
+ t
365
398
  for p in parts
399
+ if (t := _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value))
366
400
  ]
367
401
  )
368
402
  text_res = f"{text_res}{delim}"
@@ -374,7 +408,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
374
408
  text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
375
409
  else:
376
410
  text_res = ""
377
- return SerializationResult(text=text_res)
411
+ return create_ser_result(text=text_res, span_source=parts)
378
412
 
379
413
 
380
414
  class DocTagsInlineSerializer(BaseInlineSerializer):
@@ -392,7 +426,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
392
426
  **kwargs,
393
427
  ) -> SerializationResult:
394
428
  """Serializes the passed item."""
395
- my_visited = visited or set()
429
+ my_visited = visited if visited is not None else set()
396
430
  params = DocTagsParams(**kwargs)
397
431
  parts = doc_serializer.get_parts(
398
432
  item=item,
@@ -407,7 +441,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
407
441
  if text_res:
408
442
  text_res = f"{text_res}{delim}"
409
443
  text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
410
- return SerializationResult(text=text_res)
444
+ return create_ser_result(text=text_res, span_source=parts)
411
445
 
412
446
 
413
447
  class DocTagsFallbackSerializer(BaseFallbackSerializer):
@@ -423,8 +457,7 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
423
457
  **kwargs,
424
458
  ) -> SerializationResult:
425
459
  """Serializes the passed item."""
426
- text_res = ""
427
- return SerializationResult(text=text_res)
460
+ return create_ser_result()
428
461
 
429
462
 
430
463
  class DocTagsDocSerializer(DocSerializer):
@@ -443,24 +476,28 @@ class DocTagsDocSerializer(DocSerializer):
443
476
  params: DocTagsParams = DocTagsParams()
444
477
 
445
478
  @override
446
- def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
479
+ def serialize_page(
480
+ self, *, parts: list[SerializationResult], **kwargs
481
+ ) -> SerializationResult:
447
482
  """Serialize a page out of its parts."""
448
483
  delim = _get_delim(params=self.params)
449
- text_res = delim.join([p.text for p in parts])
450
- return SerializationResult(text=text_res)
484
+ text_res = delim.join([p.text for p in parts if p.text])
485
+ return create_ser_result(text=text_res, span_source=parts)
451
486
 
452
487
  @override
453
- def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
488
+ def serialize_doc(
489
+ self, *, pages: dict[Optional[int], SerializationResult], **kwargs
490
+ ) -> SerializationResult:
454
491
  """Serialize a document out of its pages."""
455
492
  delim = _get_delim(params=self.params)
456
493
  if self.params.add_page_break:
457
494
  page_sep = f"{delim}<{DocumentToken.PAGE_BREAK.value}>{delim}"
458
- content = page_sep.join([p.text for p in pages if p.text])
495
+ content = page_sep.join([text for k in pages if (text := pages[k].text)])
459
496
  else:
460
- content = self.serialize_page(parts=pages).text
497
+ content = self.serialize_page(parts=list(pages.values())).text
461
498
  wrap_tag = DocumentToken.DOCUMENT.value
462
499
  text_res = f"<{wrap_tag}>{content}{delim}</{wrap_tag}>"
463
- return SerializationResult(text=text_res)
500
+ return create_ser_result(text=text_res, span_source=list(pages.values()))
464
501
 
465
502
  @override
466
503
  def serialize_captions(
@@ -470,11 +507,10 @@ class DocTagsDocSerializer(DocSerializer):
470
507
  ) -> SerializationResult:
471
508
  """Serialize the item's captions."""
472
509
  params = DocTagsParams(**kwargs)
473
- parts: list[str] = []
474
-
510
+ results: list[SerializationResult] = []
475
511
  if item.captions:
476
- cap_text = super().serialize_captions(item, **kwargs).text
477
- if cap_text:
512
+ cap_res = super().serialize_captions(item, **kwargs)
513
+ if cap_res.text:
478
514
  if params.add_location:
479
515
  for caption in item.captions:
480
516
  if caption.cref not in self.get_excluded_refs(**kwargs):
@@ -484,9 +520,9 @@ class DocTagsDocSerializer(DocSerializer):
484
520
  xsize=params.xsize,
485
521
  ysize=params.ysize,
486
522
  )
487
- parts.append(loc_txt)
488
- parts.append(cap_text)
489
- text_res = "".join(parts)
523
+ results.append(create_ser_result(text=loc_txt))
524
+ results.append(cap_res)
525
+ text_res = "".join([r.text for r in results])
490
526
  if text_res:
491
527
  text_res = _wrap(text=text_res, wrap_tag=DocumentToken.CAPTION.value)
492
- return SerializationResult(text=text_res)
528
+ return create_ser_result(text=text_res, span_source=results)