docling-core 2.33.0__tar.gz → 2.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. More details are available on the original release page.

Files changed (106)
  1. {docling_core-2.33.0 → docling_core-2.34.0}/PKG-INFO +2 -2
  2. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/serializer/base.py +34 -0
  3. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/serializer/common.py +37 -3
  4. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/serializer/doctags.py +65 -6
  5. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/serializer/html.py +61 -23
  6. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/serializer/markdown.py +85 -18
  7. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/doc/document.py +217 -43
  8. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core.egg-info/PKG-INFO +2 -2
  9. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core.egg-info/requires.txt +1 -1
  10. {docling_core-2.33.0 → docling_core-2.34.0}/pyproject.toml +2 -2
  11. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_docling_doc.py +34 -1
  12. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_doctags_load.py +23 -0
  13. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_serialization.py +69 -5
  14. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_visualization.py +12 -0
  15. {docling_core-2.33.0 → docling_core-2.34.0}/LICENSE +0 -0
  16. {docling_core-2.33.0 → docling_core-2.34.0}/README.md +0 -0
  17. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/__init__.py +0 -0
  18. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/cli/__init__.py +0 -0
  19. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/cli/view.py +0 -0
  20. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/experimental/__init__.py +0 -0
  21. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/py.typed +0 -0
  22. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  23. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  24. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  25. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  26. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  27. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  28. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  29. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  30. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/search/__init__.py +0 -0
  31. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  32. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/search/mapping.py +0 -0
  33. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/search/meta.py +0 -0
  34. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/search/package.py +0 -0
  35. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/__init__.py +0 -0
  36. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/chunker/__init__.py +0 -0
  37. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/chunker/base.py +0 -0
  38. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  39. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  40. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  41. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  42. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  43. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  44. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/serializer/__init__.py +0 -0
  45. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  46. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  47. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/visualizer/base.py +0 -0
  48. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  49. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  50. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/__init__.py +0 -0
  51. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/base.py +0 -0
  52. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/doc/__init__.py +0 -0
  53. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/doc/base.py +0 -0
  54. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/doc/labels.py +0 -0
  55. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/doc/page.py +0 -0
  56. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/doc/tokens.py +0 -0
  57. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/doc/utils.py +0 -0
  58. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/gen/__init__.py +0 -0
  59. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/gen/generic.py +0 -0
  60. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/io/__init__.py +0 -0
  61. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  62. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/legacy_doc/base.py +0 -0
  63. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  64. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  65. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  66. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/legacy_doc/document.py +0 -0
  67. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  68. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/nlp/__init__.py +0 -0
  69. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/nlp/qa.py +0 -0
  70. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/nlp/qa_labels.py +0 -0
  71. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/rec/__init__.py +0 -0
  72. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/rec/attribute.py +0 -0
  73. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/rec/base.py +0 -0
  74. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/rec/predicate.py +0 -0
  75. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/rec/record.py +0 -0
  76. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/rec/statement.py +0 -0
  77. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/types/rec/subject.py +0 -0
  78. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/utils/__init__.py +0 -0
  79. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/utils/alias.py +0 -0
  80. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/utils/file.py +0 -0
  81. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/utils/generate_docs.py +0 -0
  82. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/utils/generate_jsonschema.py +0 -0
  83. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/utils/legacy.py +0 -0
  84. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/utils/validate.py +0 -0
  85. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core/utils/validators.py +0 -0
  86. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core.egg-info/SOURCES.txt +0 -0
  87. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core.egg-info/dependency_links.txt +0 -0
  88. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core.egg-info/entry_points.txt +0 -0
  89. {docling_core-2.33.0 → docling_core-2.34.0}/docling_core.egg-info/top_level.txt +0 -0
  90. {docling_core-2.33.0 → docling_core-2.34.0}/setup.cfg +0 -0
  91. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_base.py +0 -0
  92. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_collection.py +0 -0
  93. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_data_gen_flag.py +0 -0
  94. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_doc_base.py +0 -0
  95. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_doc_legacy_convert.py +0 -0
  96. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_doc_schema.py +0 -0
  97. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_doc_schema_extractor.py +0 -0
  98. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_hierarchical_chunker.py +0 -0
  99. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_hybrid_chunker.py +0 -0
  100. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_json_schema_to_search_mapper.py +0 -0
  101. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_nlp_qa.py +0 -0
  102. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_otsl_table_export.py +0 -0
  103. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_page.py +0 -0
  104. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_rec_schema.py +0 -0
  105. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_search_meta.py +0 -0
  106. {docling_core-2.33.0 → docling_core-2.34.0}/test/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.33.0
3
+ Version: 2.34.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -32,7 +32,7 @@ Requires-Dist: pandas<3.0.0,>=2.1.4
32
32
  Requires-Dist: pillow<12.0.0,>=10.0.0
33
33
  Requires-Dist: pyyaml<7.0.0,>=5.1
34
34
  Requires-Dist: typing-extensions<5.0.0,>=4.12.2
35
- Requires-Dist: typer<0.16.0,>=0.12.5
35
+ Requires-Dist: typer<0.17.0,>=0.12.5
36
36
  Requires-Dist: latex2mathml<4.0.0,>=3.77.0
37
37
  Provides-Extra: chunking
38
38
  Requires-Dist: semchunk<3.0.0,>=2.2.0; extra == "chunking"
@@ -202,6 +202,16 @@ class BaseDocSerializer(ABC):
202
202
  """Hook for strikethrough formatting serialization."""
203
203
  ...
204
204
 
205
+ @abstractmethod
206
+ def serialize_subscript(self, text: str, **kwargs: Any) -> str:
207
+ """Hook for subscript formatting serialization."""
208
+ ...
209
+
210
+ @abstractmethod
211
+ def serialize_superscript(self, text: str, **kwargs: Any) -> str:
212
+ """Hook for superscript formatting serialization."""
213
+ ...
214
+
205
215
  @abstractmethod
206
216
  def serialize_hyperlink(
207
217
  self,
@@ -239,6 +249,15 @@ class BaseDocSerializer(ABC):
239
249
  """Serialize the item's captions."""
240
250
  ...
241
251
 
252
+ @abstractmethod
253
+ def serialize_annotations(
254
+ self,
255
+ item: DocItem,
256
+ **kwargs: Any,
257
+ ) -> SerializationResult:
258
+ """Serialize the item's annotations."""
259
+ ...
260
+
242
261
  @abstractmethod
243
262
  def get_excluded_refs(self, **kwargs: Any) -> set[str]:
244
263
  """Get references to excluded items."""
@@ -257,3 +276,18 @@ class BaseSerializerProvider(ABC):
257
276
  def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
258
277
  """Get a the associated serializer."""
259
278
  ...
279
+
280
+
281
+ class BaseAnnotationSerializer(ABC):
282
+ """Base class for annotation serializers."""
283
+
284
+ @abstractmethod
285
+ def serialize(
286
+ self,
287
+ *,
288
+ item: DocItem,
289
+ doc: DoclingDocument,
290
+ **kwargs: Any,
291
+ ) -> SerializationResult:
292
+ """Serializes the passed annotation."""
293
+ ...
@@ -15,6 +15,7 @@ from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_fie
15
15
  from typing_extensions import Self, override
16
16
 
17
17
  from docling_core.transforms.serializer.base import (
18
+ BaseAnnotationSerializer,
18
19
  BaseDocSerializer,
19
20
  BaseFallbackSerializer,
20
21
  BaseFormSerializer,
@@ -30,6 +31,7 @@ from docling_core.transforms.serializer.base import (
30
31
  from docling_core.types.doc.document import (
31
32
  DOCUMENT_TOKENS_EXPORT_LABELS,
32
33
  ContentLayer,
34
+ DescriptionAnnotation,
33
35
  DocItem,
34
36
  DoclingDocument,
35
37
  FloatingItem,
@@ -41,9 +43,10 @@ from docling_core.types.doc.document import (
41
43
  OrderedList,
42
44
  PictureClassificationData,
43
45
  PictureDataType,
44
- PictureDescriptionData,
45
46
  PictureItem,
46
47
  PictureMoleculeData,
48
+ Script,
49
+ TableAnnotationType,
47
50
  TableItem,
48
51
  TextItem,
49
52
  UnorderedList,
@@ -122,7 +125,9 @@ def _iterate_items(
122
125
  yield item
123
126
 
124
127
 
125
- def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
128
+ def _get_annotation_text(
129
+ annotation: Union[PictureDataType, TableAnnotationType],
130
+ ) -> Optional[str]:
126
131
  result = None
127
132
  if isinstance(annotation, PictureClassificationData):
128
133
  predicted_class = (
@@ -132,7 +137,7 @@ def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
132
137
  )
133
138
  if predicted_class is not None:
134
139
  result = predicted_class.replace("_", " ")
135
- elif isinstance(annotation, PictureDescriptionData):
140
+ elif isinstance(annotation, DescriptionAnnotation):
136
141
  result = annotation.text
137
142
  elif isinstance(annotation, PictureMoleculeData):
138
143
  result = annotation.smi
@@ -211,6 +216,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
211
216
  list_serializer: BaseListSerializer
212
217
  inline_serializer: BaseInlineSerializer
213
218
 
219
+ annotation_serializer: BaseAnnotationSerializer
220
+
214
221
  params: CommonParams = CommonParams()
215
222
 
216
223
  _excluded_refs_cache: dict[str, set[str]] = {}
@@ -449,6 +456,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
449
456
  res = self.serialize_underline(text=res)
450
457
  if formatting.strikethrough:
451
458
  res = self.serialize_strikethrough(text=res)
459
+ if formatting.script == Script.SUB:
460
+ res = self.serialize_subscript(text=res)
461
+ elif formatting.script == Script.SUPER:
462
+ res = self.serialize_superscript(text=res)
452
463
  if params.include_hyperlinks and hyperlink:
453
464
  res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
454
465
  return res
@@ -473,6 +484,16 @@ class DocSerializer(BaseModel, BaseDocSerializer):
473
484
  """Hook for strikethrough formatting serialization."""
474
485
  return text
475
486
 
487
+ @override
488
+ def serialize_subscript(self, text: str, **kwargs: Any) -> str:
489
+ """Hook for subscript formatting serialization."""
490
+ return text
491
+
492
+ @override
493
+ def serialize_superscript(self, text: str, **kwargs: Any) -> str:
494
+ """Hook for superscript formatting serialization."""
495
+ return text
496
+
476
497
  @override
477
498
  def serialize_hyperlink(
478
499
  self,
@@ -505,6 +526,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
505
526
  text_res = ""
506
527
  return create_ser_result(text=text_res, span_source=results)
507
528
 
529
+ @override
530
+ def serialize_annotations(
531
+ self,
532
+ item: DocItem,
533
+ **kwargs: Any,
534
+ ) -> SerializationResult:
535
+ """Serialize the item's annotations."""
536
+ return self.annotation_serializer.serialize(
537
+ item=item,
538
+ doc=self.doc,
539
+ **kwargs,
540
+ )
541
+
508
542
  def _get_applicable_pages(self) -> Optional[list[int]]:
509
543
  pages = {
510
544
  item.prov[0].page_no: ...
@@ -7,6 +7,7 @@ from pydantic import BaseModel
7
7
  from typing_extensions import override
8
8
 
9
9
  from docling_core.transforms.serializer.base import (
10
+ BaseAnnotationSerializer,
10
11
  BaseDocSerializer,
11
12
  BaseFallbackSerializer,
12
13
  BaseFormSerializer,
@@ -17,12 +18,14 @@ from docling_core.transforms.serializer.base import (
17
18
  BaseTableSerializer,
18
19
  BaseTextSerializer,
19
20
  SerializationResult,
21
+ Span,
20
22
  )
21
23
  from docling_core.transforms.serializer.common import (
22
24
  CommonParams,
23
25
  DocSerializer,
24
26
  create_ser_result,
25
27
  )
28
+ from docling_core.types.doc.base import BoundingBox
26
29
  from docling_core.types.doc.document import (
27
30
  CodeItem,
28
31
  DocItem,
@@ -38,6 +41,7 @@ from docling_core.types.doc.document import (
38
41
  PictureItem,
39
42
  PictureMoleculeData,
40
43
  PictureTabularChartData,
44
+ ProvenanceItem,
41
45
  TableItem,
42
46
  TextItem,
43
47
  UnorderedList,
@@ -414,6 +418,39 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
414
418
  class DocTagsInlineSerializer(BaseInlineSerializer):
415
419
  """DocTags-specific inline group serializer."""
416
420
 
421
+ def _get_inline_location_tags(
422
+ self, doc: DoclingDocument, item: InlineGroup, params: DocTagsParams
423
+ ) -> SerializationResult:
424
+
425
+ prov: Optional[ProvenanceItem] = None
426
+ boxes: list[BoundingBox] = []
427
+ doc_items: list[DocItem] = []
428
+ for it, _ in doc.iterate_items(root=item):
429
+ if isinstance(it, DocItem):
430
+ for prov in it.prov:
431
+ boxes.append(prov.bbox)
432
+ doc_items.append(it)
433
+ if prov is None:
434
+ return create_ser_result()
435
+
436
+ bbox = BoundingBox.enclosing_bbox(boxes=boxes)
437
+
438
+ # using last seen prov as reference for page dims
439
+ page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
440
+
441
+ loc_str = DocumentToken.get_location(
442
+ bbox=bbox.to_top_left_origin(page_h).as_tuple(),
443
+ page_w=page_w,
444
+ page_h=page_h,
445
+ xsize=params.xsize,
446
+ ysize=params.ysize,
447
+ )
448
+
449
+ return SerializationResult(
450
+ text=loc_str,
451
+ spans=[Span(item=it) for it in doc_items],
452
+ )
453
+
417
454
  @override
418
455
  def serialize(
419
456
  self,
@@ -428,12 +465,23 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
428
465
  """Serializes the passed item."""
429
466
  my_visited = visited if visited is not None else set()
430
467
  params = DocTagsParams(**kwargs)
431
- parts = doc_serializer.get_parts(
432
- item=item,
433
- list_level=list_level,
434
- is_inline_scope=True,
435
- visited=my_visited,
436
- **kwargs,
468
+ parts: List[SerializationResult] = []
469
+ if params.add_location:
470
+ inline_loc_tags_ser_res = self._get_inline_location_tags(
471
+ doc=doc,
472
+ item=item,
473
+ params=params,
474
+ )
475
+ parts.append(inline_loc_tags_ser_res)
476
+ params.add_location = False # suppress children location serialization
477
+ parts.extend(
478
+ doc_serializer.get_parts(
479
+ item=item,
480
+ list_level=list_level,
481
+ is_inline_scope=True,
482
+ visited=my_visited,
483
+ **{**kwargs, **params.model_dump()},
484
+ )
437
485
  )
438
486
  wrap_tag = DocumentToken.INLINE.value
439
487
  delim = _get_delim(params=params)
@@ -460,6 +508,15 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
460
508
  return create_ser_result()
461
509
 
462
510
 
511
+ class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
512
+ """DocTags-specific annotation serializer."""
513
+
514
+ @override
515
+ def serialize(self, *, item: DocItem, **kwargs: Any) -> SerializationResult:
516
+ """Serializes the item's annotations."""
517
+ return create_ser_result()
518
+
519
+
463
520
  class DocTagsDocSerializer(DocSerializer):
464
521
  """DocTags-specific document serializer."""
465
522
 
@@ -473,6 +530,8 @@ class DocTagsDocSerializer(DocSerializer):
473
530
  list_serializer: BaseListSerializer = DocTagsListSerializer()
474
531
  inline_serializer: BaseInlineSerializer = DocTagsInlineSerializer()
475
532
 
533
+ annotation_serializer: BaseAnnotationSerializer = DocTagsAnnotationSerializer()
534
+
476
535
  params: DocTagsParams = DocTagsParams()
477
536
 
478
537
  @override
@@ -21,6 +21,7 @@ from pydantic import AnyUrl, BaseModel
21
21
  from typing_extensions import override
22
22
 
23
23
  from docling_core.transforms.serializer.base import (
24
+ BaseAnnotationSerializer,
24
25
  BaseDocSerializer,
25
26
  BaseFallbackSerializer,
26
27
  BaseFormSerializer,
@@ -35,7 +36,7 @@ from docling_core.transforms.serializer.base import (
35
36
  from docling_core.transforms.serializer.common import (
36
37
  CommonParams,
37
38
  DocSerializer,
38
- _get_picture_annotation_text,
39
+ _get_annotation_text,
39
40
  create_ser_result,
40
41
  )
41
42
  from docling_core.transforms.serializer.html_styles import (
@@ -47,6 +48,7 @@ from docling_core.types.doc.base import ImageRefMode
47
48
  from docling_core.types.doc.document import (
48
49
  CodeItem,
49
50
  ContentLayer,
51
+ DescriptionAnnotation,
50
52
  DocItem,
51
53
  DoclingDocument,
52
54
  FloatingItem,
@@ -59,7 +61,9 @@ from docling_core.types.doc.document import (
59
61
  ListItem,
60
62
  NodeItem,
61
63
  OrderedList,
64
+ PictureClassificationData,
62
65
  PictureItem,
66
+ PictureMoleculeData,
63
67
  PictureTabularChartData,
64
68
  SectionHeaderItem,
65
69
  TableCell,
@@ -758,14 +762,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
758
762
  """HTML-specific fallback serializer."""
759
763
 
760
764
  @override
761
- def serialize(
762
- self,
763
- *,
764
- item: NodeItem,
765
- doc_serializer: "BaseDocSerializer",
766
- doc: DoclingDocument,
767
- **kwargs: Any,
768
- ) -> SerializationResult:
765
+ def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
769
766
  """Fallback serializer for items not handled by other serializers."""
770
767
  if isinstance(item, DocItem):
771
768
  return create_ser_result(
@@ -777,6 +774,42 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
777
774
  return create_ser_result()
778
775
 
779
776
 
777
+ class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
778
+ """HTML-specific annotation serializer."""
779
+
780
+ def serialize(
781
+ self,
782
+ *,
783
+ item: DocItem,
784
+ doc: DoclingDocument,
785
+ **kwargs: Any,
786
+ ) -> SerializationResult:
787
+ """Serializes the passed annotation to HTML format."""
788
+ res_parts: list[SerializationResult] = []
789
+ for ann in item.get_annotations():
790
+ if isinstance(
791
+ ann,
792
+ (PictureClassificationData, DescriptionAnnotation, PictureMoleculeData),
793
+ ):
794
+ if ann_text := _get_annotation_text(ann):
795
+ text_dir = get_text_direction(ann_text)
796
+ dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
797
+ ann_ser_res = create_ser_result(
798
+ text=(
799
+ f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
800
+ f"{html.escape(ann_text)}"
801
+ f"</div>"
802
+ ),
803
+ span_source=item,
804
+ )
805
+ res_parts.append(ann_ser_res)
806
+
807
+ return create_ser_result(
808
+ text=" ".join([r.text for r in res_parts if r.text]),
809
+ span_source=res_parts,
810
+ )
811
+
812
+
780
813
  class HTMLDocSerializer(DocSerializer):
781
814
  """HTML-specific document serializer."""
782
815
 
@@ -790,6 +823,8 @@ class HTMLDocSerializer(DocSerializer):
790
823
  list_serializer: BaseListSerializer = HTMLListSerializer()
791
824
  inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
792
825
 
826
+ annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
827
+
793
828
  params: HTMLParams = HTMLParams()
794
829
 
795
830
  @override
@@ -812,6 +847,16 @@ class HTMLDocSerializer(DocSerializer):
812
847
  """Apply HTML-specific strikethrough serialization."""
813
848
  return f"<del>{text}</del>"
814
849
 
850
+ @override
851
+ def serialize_subscript(self, text: str, **kwargs: Any) -> str:
852
+ """Apply HTML-specific subscript serialization."""
853
+ return f"<sub>{text}</sub>"
854
+
855
+ @override
856
+ def serialize_superscript(self, text: str, **kwargs: Any) -> str:
857
+ """Apply HTML-specific superscript serialization."""
858
+ return f"<sup>{text}</sup>"
859
+
815
860
  @override
816
861
  def serialize_hyperlink(
817
862
  self,
@@ -968,20 +1013,13 @@ class HTMLDocSerializer(DocSerializer):
968
1013
  results.append(cap_ser_res)
969
1014
 
970
1015
  if params.include_annotations and item.self_ref not in excluded_refs:
971
- if isinstance(item, PictureItem):
972
- for ann in item.annotations:
973
- if ann_text := _get_picture_annotation_text(annotation=ann):
974
- text_dir = get_text_direction(ann_text)
975
- dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
976
- ann_ser_res = create_ser_result(
977
- text=(
978
- f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
979
- f"{html.escape(ann_text)}"
980
- f"</div>"
981
- ),
982
- span_source=item,
983
- )
984
- results.append(ann_ser_res)
1016
+ if isinstance(item, (PictureItem, TableItem)):
1017
+ ann_res = self.serialize_annotations(
1018
+ item=item,
1019
+ **kwargs,
1020
+ )
1021
+ if ann_res.text:
1022
+ results.append(ann_res)
985
1023
 
986
1024
  text_res = params.caption_delim.join([r.text for r in results])
987
1025
  if text_res:
@@ -15,6 +15,7 @@ from tabulate import tabulate
15
15
  from typing_extensions import override
16
16
 
17
17
  from docling_core.transforms.serializer.base import (
18
+ BaseAnnotationSerializer,
18
19
  BaseDocSerializer,
19
20
  BaseFallbackSerializer,
20
21
  BaseFormSerializer,
@@ -29,7 +30,7 @@ from docling_core.transforms.serializer.base import (
29
30
  from docling_core.transforms.serializer.common import (
30
31
  CommonParams,
31
32
  DocSerializer,
32
- _get_picture_annotation_text,
33
+ _get_annotation_text,
33
34
  _PageBreakSerResult,
34
35
  create_ser_result,
35
36
  )
@@ -37,6 +38,7 @@ from docling_core.types.doc.base import ImageRefMode
37
38
  from docling_core.types.doc.document import (
38
39
  CodeItem,
39
40
  ContentLayer,
41
+ DescriptionAnnotation,
40
42
  DocItem,
41
43
  DoclingDocument,
42
44
  FloatingItem,
@@ -48,7 +50,9 @@ from docling_core.types.doc.document import (
48
50
  KeyValueItem,
49
51
  NodeItem,
50
52
  OrderedList,
53
+ PictureClassificationData,
51
54
  PictureItem,
55
+ PictureMoleculeData,
52
56
  PictureTabularChartData,
53
57
  SectionHeaderItem,
54
58
  TableItem,
@@ -58,6 +62,23 @@ from docling_core.types.doc.document import (
58
62
  )
59
63
 
60
64
 
65
+ def _get_annotation_ser_result(
66
+ ann_kind: str, ann_text: str, mark_annotation: bool, doc_item: DocItem
67
+ ):
68
+ return create_ser_result(
69
+ text=(
70
+ (
71
+ f'<!--<annotation kind="{ann_kind}">-->'
72
+ f"{ann_text}"
73
+ f"<!--<annotation/>-->"
74
+ )
75
+ if mark_annotation
76
+ else ann_text
77
+ ),
78
+ span_source=doc_item,
79
+ )
80
+
81
+
61
82
  class MarkdownParams(CommonParams):
62
83
  """Markdown-specific serialization parameters."""
63
84
 
@@ -136,6 +157,49 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
136
157
  return create_ser_result(text=text, span_source=res_parts)
137
158
 
138
159
 
160
+ class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
161
+ """Markdown-specific annotation serializer."""
162
+
163
+ def serialize(
164
+ self,
165
+ *,
166
+ item: DocItem,
167
+ doc: DoclingDocument,
168
+ **kwargs: Any,
169
+ ) -> SerializationResult:
170
+ """Serialize the item's annotations."""
171
+ params = MarkdownParams(**kwargs)
172
+
173
+ res_parts: list[SerializationResult] = []
174
+ for ann in item.get_annotations():
175
+ if isinstance(
176
+ ann,
177
+ (
178
+ PictureClassificationData,
179
+ DescriptionAnnotation,
180
+ PictureMoleculeData,
181
+ ),
182
+ ):
183
+ if ann_text := _get_annotation_text(ann):
184
+ ann_res = create_ser_result(
185
+ text=(
186
+ (
187
+ f'<!--<annotation kind="{ann.kind}">-->'
188
+ f"{ann_text}"
189
+ f"<!--<annotation/>-->"
190
+ )
191
+ if params.mark_annotations
192
+ else ann_text
193
+ ),
194
+ span_source=item,
195
+ )
196
+ res_parts.append(ann_res)
197
+ return create_ser_result(
198
+ text="\n\n".join([r.text for r in res_parts if r.text]),
199
+ span_source=item,
200
+ )
201
+
202
+
139
203
  class MarkdownTableSerializer(BaseTableSerializer):
140
204
  """Markdown-specific table item serializer."""
141
205
 
@@ -149,6 +213,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
149
213
  **kwargs: Any,
150
214
  ) -> SerializationResult:
151
215
  """Serializes the passed item."""
216
+ params = MarkdownParams(**kwargs)
152
217
  res_parts: list[SerializationResult] = []
153
218
 
154
219
  cap_res = doc_serializer.serialize_captions(
@@ -159,6 +224,16 @@ class MarkdownTableSerializer(BaseTableSerializer):
159
224
  res_parts.append(cap_res)
160
225
 
161
226
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
227
+
228
+ if params.include_annotations:
229
+
230
+ ann_res = doc_serializer.serialize_annotations(
231
+ item=item,
232
+ **kwargs,
233
+ )
234
+ if ann_res.text:
235
+ res_parts.append(ann_res)
236
+
162
237
  rows = [
163
238
  [
164
239
  # make sure that md tables are not broken
@@ -214,22 +289,12 @@ class MarkdownPictureSerializer(BasePictureSerializer):
214
289
 
215
290
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
216
291
  if params.include_annotations:
217
-
218
- for ann in item.annotations:
219
- if ann_text := _get_picture_annotation_text(annotation=ann):
220
- ann_ser_res = create_ser_result(
221
- text=(
222
- (
223
- f'<!--<annotation kind="{ann.kind}">-->'
224
- f"{ann_text}"
225
- f"<!--<annotation/>-->"
226
- )
227
- if params.mark_annotations
228
- else ann_text
229
- ),
230
- span_source=item,
231
- )
232
- res_parts.append(ann_ser_res)
292
+ ann_res = doc_serializer.serialize_annotations(
293
+ item=item,
294
+ **kwargs,
295
+ )
296
+ if ann_res.text:
297
+ res_parts.append(ann_res)
233
298
 
234
299
  img_res = self._serialize_image_part(
235
300
  item=item,
@@ -257,7 +322,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
257
322
  res_parts.append(
258
323
  create_ser_result(text=md_table_content, span_source=item)
259
324
  )
260
- text_res = "\n\n".join([r.text for r in res_parts])
325
+ text_res = "\n\n".join([r.text for r in res_parts if r.text])
261
326
 
262
327
  return create_ser_result(text=text_res, span_source=res_parts)
263
328
 
@@ -471,6 +536,8 @@ class MarkdownDocSerializer(DocSerializer):
471
536
  list_serializer: BaseListSerializer = MarkdownListSerializer()
472
537
  inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
473
538
 
539
+ annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
540
+
474
541
  params: MarkdownParams = MarkdownParams()
475
542
 
476
543
  @override