docling-core 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -18,7 +18,11 @@ from docling_core.experimental.serializer.base import (
18
18
  BaseTextSerializer,
19
19
  SerializationResult,
20
20
  )
21
- from docling_core.experimental.serializer.common import CommonParams, DocSerializer
21
+ from docling_core.experimental.serializer.common import (
22
+ CommonParams,
23
+ DocSerializer,
24
+ create_ser_result,
25
+ )
22
26
  from docling_core.types.doc.document import (
23
27
  CodeItem,
24
28
  DocItem,
@@ -33,10 +37,12 @@ from docling_core.types.doc.document import (
33
37
  PictureClassificationData,
34
38
  PictureItem,
35
39
  PictureMoleculeData,
40
+ PictureTabularChartData,
36
41
  TableItem,
37
42
  TextItem,
38
43
  UnorderedList,
39
44
  )
45
+ from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
40
46
  from docling_core.types.doc.tokens import DocumentToken
41
47
 
42
48
 
@@ -135,7 +141,7 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
135
141
  text_res = "".join(parts)
136
142
  if wrap_tag is not None:
137
143
  text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
138
- return SerializationResult(text=text_res)
144
+ return create_ser_result(text=text_res, span_source=item)
139
145
 
140
146
 
141
147
  class DocTagsTableSerializer(BaseTableSerializer):
@@ -153,7 +159,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
153
159
  """Serializes the passed item."""
154
160
  params = DocTagsParams(**kwargs)
155
161
 
156
- parts: list[str] = []
162
+ res_parts: list[SerializationResult] = []
157
163
 
158
164
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
159
165
  if params.add_location:
@@ -162,7 +168,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
162
168
  xsize=params.xsize,
163
169
  ysize=params.ysize,
164
170
  )
165
- parts.append(loc_text)
171
+ res_parts.append(create_ser_result(text=loc_text, span_source=item))
166
172
 
167
173
  otsl_text = item.export_to_otsl(
168
174
  doc=doc,
@@ -171,18 +177,18 @@ class DocTagsTableSerializer(BaseTableSerializer):
171
177
  xsize=params.xsize,
172
178
  ysize=params.ysize,
173
179
  )
174
- parts.append(otsl_text)
180
+ res_parts.append(create_ser_result(text=otsl_text, span_source=item))
175
181
 
176
182
  if params.add_caption:
177
- cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
178
- if cap_text:
179
- parts.append(cap_text)
183
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
184
+ if cap_res.text:
185
+ res_parts.append(cap_res)
180
186
 
181
- text_res = "".join(parts)
187
+ text_res = "".join([r.text for r in res_parts])
182
188
  if text_res:
183
189
  text_res = _wrap(text=text_res, wrap_tag=DocumentToken.OTSL.value)
184
190
 
185
- return SerializationResult(text=text_res)
191
+ return create_ser_result(text=text_res, span_source=res_parts)
186
192
 
187
193
 
188
194
  class DocTagsPictureSerializer(BasePictureSerializer):
@@ -199,7 +205,8 @@ class DocTagsPictureSerializer(BasePictureSerializer):
199
205
  ) -> SerializationResult:
200
206
  """Serializes the passed item."""
201
207
  params = DocTagsParams(**kwargs)
202
- parts: list[str] = []
208
+ res_parts: list[SerializationResult] = []
209
+ is_chart = False
203
210
 
204
211
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
205
212
  body = ""
@@ -217,6 +224,16 @@ class DocTagsPictureSerializer(BasePictureSerializer):
217
224
  ]
218
225
  if len(classifications) > 0:
219
226
  predicted_class = classifications[0].predicted_classes[0].class_name
227
+ if predicted_class in [
228
+ PictureClassificationLabel.PIE_CHART,
229
+ PictureClassificationLabel.BAR_CHART,
230
+ PictureClassificationLabel.STACKED_BAR_CHART,
231
+ PictureClassificationLabel.LINE_CHART,
232
+ PictureClassificationLabel.FLOW_CHART,
233
+ PictureClassificationLabel.SCATTER_CHART,
234
+ PictureClassificationLabel.HEATMAP,
235
+ ]:
236
+ is_chart = True
220
237
  body += DocumentToken.get_picture_classification_token(predicted_class)
221
238
 
222
239
  smiles_annotations = [
@@ -226,20 +243,35 @@ class DocTagsPictureSerializer(BasePictureSerializer):
226
243
  body += _wrap(
227
244
  text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value
228
245
  )
229
- parts.append(body)
246
+
247
+ tabular_chart_annotations = [
248
+ ann
249
+ for ann in item.annotations
250
+ if isinstance(ann, PictureTabularChartData)
251
+ ]
252
+ if len(tabular_chart_annotations) > 0:
253
+ temp_doc = DoclingDocument(name="temp")
254
+ temp_table = temp_doc.add_table(
255
+ data=tabular_chart_annotations[0].chart_data
256
+ )
257
+ otsl_content = temp_table.export_to_otsl(
258
+ temp_doc, add_cell_location=False
259
+ )
260
+ body += otsl_content
261
+ res_parts.append(create_ser_result(text=body, span_source=item))
230
262
 
231
263
  if params.add_caption:
232
- cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
233
- if cap_text:
234
- parts.append(cap_text)
264
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
265
+ if cap_res.text:
266
+ res_parts.append(cap_res)
235
267
 
236
- text_res = "".join(parts)
268
+ text_res = "".join([r.text for r in res_parts])
237
269
  if text_res:
238
270
  token = DocumentToken.create_token_name_from_doc_item_label(
239
- label=item.label
271
+ label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
240
272
  )
241
273
  text_res = _wrap(text=text_res, wrap_tag=token)
242
- return SerializationResult(text=text_res)
274
+ return create_ser_result(text=text_res, span_source=res_parts)
243
275
 
244
276
 
245
277
  class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
@@ -256,8 +288,8 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
256
288
  ) -> SerializationResult:
257
289
  """Serializes the passed item."""
258
290
  params = DocTagsParams(**kwargs)
259
-
260
291
  body = ""
292
+ results: list[SerializationResult] = []
261
293
 
262
294
  page_no = 1
263
295
  if len(item.prov) > 0:
@@ -302,14 +334,16 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
302
334
  tok = f"{cell.label.value}_{cell.cell_id}"
303
335
  cell_txt = _wrap(text=cell_txt, wrap_tag=tok)
304
336
  body += cell_txt
337
+ results.append(create_ser_result(text=body, span_source=item))
305
338
 
306
339
  if params.add_caption:
307
- cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
308
- if cap_text:
309
- body += cap_text
340
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
341
+ if cap_res.text:
342
+ results.append(cap_res)
310
343
 
344
+ body = "".join([r.text for r in results])
311
345
  body = _wrap(body, DocumentToken.KEY_VALUE_REGION.value)
312
- return SerializationResult(text=body)
346
+ return create_ser_result(text=body, span_source=results)
313
347
 
314
348
 
315
349
  class DocTagsFormSerializer(BaseFormSerializer):
@@ -326,8 +360,7 @@ class DocTagsFormSerializer(BaseFormSerializer):
326
360
  ) -> SerializationResult:
327
361
  """Serializes the passed item."""
328
362
  # TODO add actual implementation
329
- text_res = ""
330
- return SerializationResult(text=text_res)
363
+ return create_ser_result()
331
364
 
332
365
 
333
366
  class DocTagsListSerializer(BaseModel, BaseListSerializer):
@@ -348,7 +381,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
348
381
  **kwargs,
349
382
  ) -> SerializationResult:
350
383
  """Serializes the passed item."""
351
- my_visited = visited or set()
384
+ my_visited = visited if visited is not None else set()
352
385
  params = DocTagsParams(**kwargs)
353
386
  parts = doc_serializer.get_parts(
354
387
  item=item,
@@ -361,8 +394,9 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
361
394
  if parts:
362
395
  text_res = delim.join(
363
396
  [
364
- _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value)
397
+ t
365
398
  for p in parts
399
+ if (t := _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value))
366
400
  ]
367
401
  )
368
402
  text_res = f"{text_res}{delim}"
@@ -374,7 +408,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
374
408
  text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
375
409
  else:
376
410
  text_res = ""
377
- return SerializationResult(text=text_res)
411
+ return create_ser_result(text=text_res, span_source=parts)
378
412
 
379
413
 
380
414
  class DocTagsInlineSerializer(BaseInlineSerializer):
@@ -392,7 +426,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
392
426
  **kwargs,
393
427
  ) -> SerializationResult:
394
428
  """Serializes the passed item."""
395
- my_visited = visited or set()
429
+ my_visited = visited if visited is not None else set()
396
430
  params = DocTagsParams(**kwargs)
397
431
  parts = doc_serializer.get_parts(
398
432
  item=item,
@@ -407,7 +441,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
407
441
  if text_res:
408
442
  text_res = f"{text_res}{delim}"
409
443
  text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
410
- return SerializationResult(text=text_res)
444
+ return create_ser_result(text=text_res, span_source=parts)
411
445
 
412
446
 
413
447
  class DocTagsFallbackSerializer(BaseFallbackSerializer):
@@ -423,8 +457,7 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
423
457
  **kwargs,
424
458
  ) -> SerializationResult:
425
459
  """Serializes the passed item."""
426
- text_res = ""
427
- return SerializationResult(text=text_res)
460
+ return create_ser_result()
428
461
 
429
462
 
430
463
  class DocTagsDocSerializer(DocSerializer):
@@ -443,24 +476,21 @@ class DocTagsDocSerializer(DocSerializer):
443
476
  params: DocTagsParams = DocTagsParams()
444
477
 
445
478
  @override
446
- def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
447
- """Serialize a page out of its parts."""
448
- delim = _get_delim(params=self.params)
449
- text_res = delim.join([p.text for p in parts])
450
- return SerializationResult(text=text_res)
451
-
452
- @override
453
- def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
479
+ def serialize_doc(
480
+ self, *, parts: list[SerializationResult], **kwargs
481
+ ) -> SerializationResult:
454
482
  """Serialize a document out of its pages."""
455
483
  delim = _get_delim(params=self.params)
484
+ text_res = delim.join([p.text for p in parts if p.text])
485
+
456
486
  if self.params.add_page_break:
457
- page_sep = f"{delim}<{DocumentToken.PAGE_BREAK.value}>{delim}"
458
- content = page_sep.join([p.text for p in pages if p.text])
459
- else:
460
- content = self.serialize_page(parts=pages).text
487
+ page_sep = f"<{DocumentToken.PAGE_BREAK.value}>"
488
+ for full_match, _, _ in self._get_page_breaks(text=text_res):
489
+ text_res = text_res.replace(full_match, page_sep)
490
+
461
491
  wrap_tag = DocumentToken.DOCUMENT.value
462
- text_res = f"<{wrap_tag}>{content}{delim}</{wrap_tag}>"
463
- return SerializationResult(text=text_res)
492
+ text_res = f"<{wrap_tag}>{text_res}{delim}</{wrap_tag}>"
493
+ return create_ser_result(text=text_res, span_source=parts)
464
494
 
465
495
  @override
466
496
  def serialize_captions(
@@ -470,11 +500,10 @@ class DocTagsDocSerializer(DocSerializer):
470
500
  ) -> SerializationResult:
471
501
  """Serialize the item's captions."""
472
502
  params = DocTagsParams(**kwargs)
473
- parts: list[str] = []
474
-
503
+ results: list[SerializationResult] = []
475
504
  if item.captions:
476
- cap_text = super().serialize_captions(item, **kwargs).text
477
- if cap_text:
505
+ cap_res = super().serialize_captions(item, **kwargs)
506
+ if cap_res.text:
478
507
  if params.add_location:
479
508
  for caption in item.captions:
480
509
  if caption.cref not in self.get_excluded_refs(**kwargs):
@@ -484,9 +513,14 @@ class DocTagsDocSerializer(DocSerializer):
484
513
  xsize=params.xsize,
485
514
  ysize=params.ysize,
486
515
  )
487
- parts.append(loc_txt)
488
- parts.append(cap_text)
489
- text_res = "".join(parts)
516
+ results.append(create_ser_result(text=loc_txt))
517
+ results.append(cap_res)
518
+ text_res = "".join([r.text for r in results])
490
519
  if text_res:
491
520
  text_res = _wrap(text=text_res, wrap_tag=DocumentToken.CAPTION.value)
492
- return SerializationResult(text=text_res)
521
+ return create_ser_result(text=text_res, span_source=results)
522
+
523
+ @override
524
+ def requires_page_break(self):
525
+ """Whether to add page breaks."""
526
+ return self.params.add_page_break