docling-core 2.26.4__tar.gz → 2.28.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (79)
  1. {docling_core-2.26.4 → docling_core-2.28.0}/PKG-INFO +4 -2
  2. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/base.py +25 -19
  3. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/common.py +17 -11
  4. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/doctags.py +14 -11
  5. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/html.py +48 -17
  6. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/markdown.py +24 -16
  7. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/chunker/hybrid_chunker.py +49 -31
  8. docling_core-2.28.0/docling_core/transforms/chunker/tokenizer/__init__.py +1 -0
  9. docling_core-2.28.0/docling_core/transforms/chunker/tokenizer/base.py +25 -0
  10. docling_core-2.28.0/docling_core/transforms/chunker/tokenizer/huggingface.py +70 -0
  11. docling_core-2.28.0/docling_core/transforms/chunker/tokenizer/openai.py +34 -0
  12. docling_core-2.28.0/docling_core/transforms/visualizer/__init__.py +1 -0
  13. docling_core-2.28.0/docling_core/transforms/visualizer/base.py +23 -0
  14. docling_core-2.28.0/docling_core/transforms/visualizer/layout_visualizer.py +201 -0
  15. docling_core-2.28.0/docling_core/transforms/visualizer/reading_order_visualizer.py +149 -0
  16. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/document.py +26 -2
  17. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/labels.py +2 -1
  18. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/page.py +4 -3
  19. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/document.py +2 -2
  20. {docling_core-2.26.4 → docling_core-2.28.0}/pyproject.toml +11 -2
  21. {docling_core-2.26.4 → docling_core-2.28.0}/LICENSE +0 -0
  22. {docling_core-2.26.4 → docling_core-2.28.0}/README.md +0 -0
  23. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/__init__.py +0 -0
  24. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/cli/__init__.py +0 -0
  25. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/cli/view.py +0 -0
  26. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/__init__.py +0 -0
  27. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/__init__.py +0 -0
  28. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/html_styles.py +0 -0
  29. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/py.typed +0 -0
  30. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  31. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  32. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  33. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  34. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  35. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  36. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  37. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  38. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/__init__.py +0 -0
  39. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  40. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/mapping.py +0 -0
  41. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/meta.py +0 -0
  42. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/package.py +0 -0
  43. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/__init__.py +0 -0
  44. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/chunker/__init__.py +0 -0
  45. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/chunker/base.py +0 -0
  46. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  47. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/__init__.py +0 -0
  48. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/base.py +0 -0
  49. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/__init__.py +0 -0
  50. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/base.py +0 -0
  51. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/tokens.py +0 -0
  52. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/utils.py +0 -0
  53. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/gen/__init__.py +0 -0
  54. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/gen/generic.py +0 -0
  55. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/io/__init__.py +0 -0
  56. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  57. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/base.py +0 -0
  58. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  59. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  60. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  61. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  62. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/nlp/__init__.py +0 -0
  63. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/nlp/qa.py +0 -0
  64. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/nlp/qa_labels.py +0 -0
  65. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/__init__.py +0 -0
  66. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/attribute.py +0 -0
  67. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/base.py +0 -0
  68. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/predicate.py +0 -0
  69. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/record.py +0 -0
  70. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/statement.py +0 -0
  71. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/subject.py +0 -0
  72. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/__init__.py +0 -0
  73. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/alias.py +0 -0
  74. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/file.py +0 -0
  75. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/generate_docs.py +0 -0
  76. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/generate_jsonschema.py +0 -0
  77. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/legacy.py +0 -0
  78. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/validate.py +0 -0
  79. {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.26.4
3
+ Version: 2.28.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -26,6 +26,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Classifier: Typing :: Typed
28
28
  Provides-Extra: chunking
29
+ Provides-Extra: chunking-openai
29
30
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
31
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
32
  Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
@@ -33,8 +34,9 @@ Requires-Dist: pandas (>=2.1.4,<3.0.0)
33
34
  Requires-Dist: pillow (>=10.0.0,<12.0.0)
34
35
  Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
35
36
  Requires-Dist: pyyaml (>=5.1,<7.0.0)
36
- Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
37
+ Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking" or extra == "chunking-openai"
37
38
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
39
+ Requires-Dist: tiktoken (>=0.9.0,<0.10.0) ; extra == "chunking-openai"
38
40
  Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
39
41
  Requires-Dist: typer (>=0.12.5,<0.16.0)
40
42
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
@@ -6,7 +6,7 @@
6
6
  """Define base classes for serialization."""
7
7
  from abc import ABC, abstractmethod
8
8
  from pathlib import Path
9
- from typing import Optional, Union
9
+ from typing import Any, Optional, Union
10
10
 
11
11
  from pydantic import AnyUrl, BaseModel
12
12
 
@@ -51,7 +51,7 @@ class BaseTextSerializer(ABC):
51
51
  item: TextItem,
52
52
  doc_serializer: "BaseDocSerializer",
53
53
  doc: DoclingDocument,
54
- **kwargs,
54
+ **kwargs: Any,
55
55
  ) -> SerializationResult:
56
56
  """Serializes the passed item."""
57
57
  ...
@@ -67,7 +67,7 @@ class BaseTableSerializer(ABC):
67
67
  item: TableItem,
68
68
  doc_serializer: "BaseDocSerializer",
69
69
  doc: DoclingDocument,
70
- **kwargs,
70
+ **kwargs: Any,
71
71
  ) -> SerializationResult:
72
72
  """Serializes the passed item."""
73
73
  ...
@@ -83,7 +83,7 @@ class BasePictureSerializer(ABC):
83
83
  item: PictureItem,
84
84
  doc_serializer: "BaseDocSerializer",
85
85
  doc: DoclingDocument,
86
- **kwargs,
86
+ **kwargs: Any,
87
87
  ) -> SerializationResult:
88
88
  """Serializes the passed item."""
89
89
  ...
@@ -99,7 +99,7 @@ class BaseKeyValueSerializer(ABC):
99
99
  item: KeyValueItem,
100
100
  doc_serializer: "BaseDocSerializer",
101
101
  doc: DoclingDocument,
102
- **kwargs,
102
+ **kwargs: Any,
103
103
  ) -> SerializationResult:
104
104
  """Serializes the passed item."""
105
105
  ...
@@ -115,7 +115,7 @@ class BaseFormSerializer(ABC):
115
115
  item: FormItem,
116
116
  doc_serializer: "BaseDocSerializer",
117
117
  doc: DoclingDocument,
118
- **kwargs,
118
+ **kwargs: Any,
119
119
  ) -> SerializationResult:
120
120
  """Serializes the passed item."""
121
121
  ...
@@ -131,7 +131,7 @@ class BaseListSerializer(ABC):
131
131
  item: Union[UnorderedList, OrderedList],
132
132
  doc_serializer: "BaseDocSerializer",
133
133
  doc: DoclingDocument,
134
- **kwargs,
134
+ **kwargs: Any,
135
135
  ) -> SerializationResult:
136
136
  """Serializes the passed item."""
137
137
  ...
@@ -147,7 +147,7 @@ class BaseInlineSerializer(ABC):
147
147
  item: InlineGroup,
148
148
  doc_serializer: "BaseDocSerializer",
149
149
  doc: DoclingDocument,
150
- **kwargs,
150
+ **kwargs: Any,
151
151
  ) -> SerializationResult:
152
152
  """Serializes the passed item."""
153
153
  ...
@@ -163,7 +163,7 @@ class BaseFallbackSerializer(ABC):
163
163
  item: NodeItem,
164
164
  doc_serializer: "BaseDocSerializer",
165
165
  doc: DoclingDocument,
166
- **kwargs,
166
+ **kwargs: Any,
167
167
  ) -> SerializationResult:
168
168
  """Serializes the passed item."""
169
169
  ...
@@ -174,34 +174,40 @@ class BaseDocSerializer(ABC):
174
174
 
175
175
  @abstractmethod
176
176
  def serialize(
177
- self, *, item: Optional[NodeItem] = None, **kwargs
177
+ self,
178
+ *,
179
+ item: Optional[NodeItem] = None,
180
+ **kwargs: Any,
178
181
  ) -> SerializationResult:
179
182
  """Run the serialization."""
180
183
  ...
181
184
 
182
185
  @abstractmethod
183
- def serialize_bold(self, text: str, **kwargs) -> str:
186
+ def serialize_bold(self, text: str, **kwargs: Any) -> str:
184
187
  """Hook for bold formatting serialization."""
185
188
  ...
186
189
 
187
190
  @abstractmethod
188
- def serialize_italic(self, text: str, **kwargs) -> str:
191
+ def serialize_italic(self, text: str, **kwargs: Any) -> str:
189
192
  """Hook for italic formatting serialization."""
190
193
  ...
191
194
 
192
195
  @abstractmethod
193
- def serialize_underline(self, text: str, **kwargs) -> str:
196
+ def serialize_underline(self, text: str, **kwargs: Any) -> str:
194
197
  """Hook for underline formatting serialization."""
195
198
  ...
196
199
 
197
200
  @abstractmethod
198
- def serialize_strikethrough(self, text: str, **kwargs) -> str:
201
+ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
199
202
  """Hook for strikethrough formatting serialization."""
200
203
  ...
201
204
 
202
205
  @abstractmethod
203
206
  def serialize_hyperlink(
204
- self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
207
+ self,
208
+ text: str,
209
+ hyperlink: Union[AnyUrl, Path],
210
+ **kwargs: Any,
205
211
  ) -> str:
206
212
  """Hook for hyperlink serialization."""
207
213
  ...
@@ -210,7 +216,7 @@ class BaseDocSerializer(ABC):
210
216
  def get_parts(
211
217
  self,
212
218
  item: Optional[NodeItem] = None,
213
- **kwargs,
219
+ **kwargs: Any,
214
220
  ) -> list[SerializationResult]:
215
221
  """Get the components to be combined for serializing this node."""
216
222
  ...
@@ -219,7 +225,7 @@ class BaseDocSerializer(ABC):
219
225
  def post_process(
220
226
  self,
221
227
  text: str,
222
- **kwargs,
228
+ **kwargs: Any,
223
229
  ) -> str:
224
230
  """Apply some text post-processing steps."""
225
231
  ...
@@ -228,13 +234,13 @@ class BaseDocSerializer(ABC):
228
234
  def serialize_captions(
229
235
  self,
230
236
  item: FloatingItem,
231
- **kwargs,
237
+ **kwargs: Any,
232
238
  ) -> SerializationResult:
233
239
  """Serialize the item's captions."""
234
240
  ...
235
241
 
236
242
  @abstractmethod
237
- def get_excluded_refs(self, **kwargs) -> set[str]:
243
+ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
238
244
  """Get references to excluded items."""
239
245
  ...
240
246
 
@@ -214,7 +214,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
214
214
  return refs
215
215
 
216
216
  @override
217
- def get_excluded_refs(self, **kwargs) -> set[str]:
217
+ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
218
218
  """References to excluded items."""
219
219
  params = self.params.merge_with_patch(patch=kwargs)
220
220
  params_json = params.model_dump_json()
@@ -252,7 +252,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
252
252
 
253
253
  @abstractmethod
254
254
  def serialize_doc(
255
- self, *, parts: list[SerializationResult], **kwargs
255
+ self,
256
+ *,
257
+ parts: list[SerializationResult],
258
+ **kwargs: Any,
256
259
  ) -> SerializationResult:
257
260
  """Serialize a document out of its pages."""
258
261
  ...
@@ -271,7 +274,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
271
274
  list_level: int = 0,
272
275
  is_inline_scope: bool = False,
273
276
  visited: Optional[set[str]] = None, # refs of visited items
274
- **kwargs,
277
+ **kwargs: Any,
275
278
  ) -> SerializationResult:
276
279
  """Serialize a given node."""
277
280
  my_visited: set[str] = visited if visited is not None else set()
@@ -380,7 +383,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
380
383
  list_level: int = 0,
381
384
  is_inline_scope: bool = False,
382
385
  visited: Optional[set[str]] = None, # refs of visited items
383
- **kwargs,
386
+ **kwargs: Any,
384
387
  ) -> list[SerializationResult]:
385
388
  """Get the components to be combined for serializing this node."""
386
389
  parts: list[SerializationResult] = []
@@ -415,7 +418,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
415
418
  *,
416
419
  formatting: Optional[Formatting] = None,
417
420
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
418
- **kwargs,
421
+ **kwargs: Any,
419
422
  ) -> str:
420
423
  """Apply some text post-processing steps."""
421
424
  params = self.params.merge_with_patch(patch=kwargs)
@@ -434,28 +437,31 @@ class DocSerializer(BaseModel, BaseDocSerializer):
434
437
  return res
435
438
 
436
439
  @override
437
- def serialize_bold(self, text: str, **kwargs) -> str:
440
+ def serialize_bold(self, text: str, **kwargs: Any) -> str:
438
441
  """Hook for bold formatting serialization."""
439
442
  return text
440
443
 
441
444
  @override
442
- def serialize_italic(self, text: str, **kwargs) -> str:
445
+ def serialize_italic(self, text: str, **kwargs: Any) -> str:
443
446
  """Hook for italic formatting serialization."""
444
447
  return text
445
448
 
446
449
  @override
447
- def serialize_underline(self, text: str, **kwargs) -> str:
450
+ def serialize_underline(self, text: str, **kwargs: Any) -> str:
448
451
  """Hook for underline formatting serialization."""
449
452
  return text
450
453
 
451
454
  @override
452
- def serialize_strikethrough(self, text: str, **kwargs) -> str:
455
+ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
453
456
  """Hook for strikethrough formatting serialization."""
454
457
  return text
455
458
 
456
459
  @override
457
460
  def serialize_hyperlink(
458
- self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
461
+ self,
462
+ text: str,
463
+ hyperlink: Union[AnyUrl, Path],
464
+ **kwargs: Any,
459
465
  ) -> str:
460
466
  """Hook for hyperlink serialization."""
461
467
  return text
@@ -464,7 +470,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
464
470
  def serialize_captions(
465
471
  self,
466
472
  item: FloatingItem,
467
- **kwargs,
473
+ **kwargs: Any,
468
474
  ) -> SerializationResult:
469
475
  """Serialize the item's captions."""
470
476
  params = self.params.merge_with_patch(patch=kwargs)
@@ -1,7 +1,7 @@
1
1
  """Define classes for Doctags serialization."""
2
2
 
3
3
  from enum import Enum
4
- from typing import Dict, List, Optional, Union
4
+ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
  from typing_extensions import override
@@ -91,7 +91,7 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
91
91
  item: TextItem,
92
92
  doc_serializer: BaseDocSerializer,
93
93
  doc: DoclingDocument,
94
- **kwargs,
94
+ **kwargs: Any,
95
95
  ) -> SerializationResult:
96
96
  """Serializes the passed item."""
97
97
  from docling_core.types.doc.document import SectionHeaderItem
@@ -154,7 +154,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
154
154
  item: TableItem,
155
155
  doc_serializer: BaseDocSerializer,
156
156
  doc: DoclingDocument,
157
- **kwargs,
157
+ **kwargs: Any,
158
158
  ) -> SerializationResult:
159
159
  """Serializes the passed item."""
160
160
  params = DocTagsParams(**kwargs)
@@ -201,7 +201,7 @@ class DocTagsPictureSerializer(BasePictureSerializer):
201
201
  item: PictureItem,
202
202
  doc_serializer: BaseDocSerializer,
203
203
  doc: DoclingDocument,
204
- **kwargs,
204
+ **kwargs: Any,
205
205
  ) -> SerializationResult:
206
206
  """Serializes the passed item."""
207
207
  params = DocTagsParams(**kwargs)
@@ -284,7 +284,7 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
284
284
  item: KeyValueItem,
285
285
  doc_serializer: "BaseDocSerializer",
286
286
  doc: DoclingDocument,
287
- **kwargs,
287
+ **kwargs: Any,
288
288
  ) -> SerializationResult:
289
289
  """Serializes the passed item."""
290
290
  params = DocTagsParams(**kwargs)
@@ -356,7 +356,7 @@ class DocTagsFormSerializer(BaseFormSerializer):
356
356
  item: FormItem,
357
357
  doc_serializer: "BaseDocSerializer",
358
358
  doc: DoclingDocument,
359
- **kwargs,
359
+ **kwargs: Any,
360
360
  ) -> SerializationResult:
361
361
  """Serializes the passed item."""
362
362
  # TODO add actual implementation
@@ -378,7 +378,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
378
378
  list_level: int = 0,
379
379
  is_inline_scope: bool = False,
380
380
  visited: Optional[set[str]] = None, # refs of visited items
381
- **kwargs,
381
+ **kwargs: Any,
382
382
  ) -> SerializationResult:
383
383
  """Serializes the passed item."""
384
384
  my_visited = visited if visited is not None else set()
@@ -423,7 +423,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
423
423
  doc: DoclingDocument,
424
424
  list_level: int = 0,
425
425
  visited: Optional[set[str]] = None, # refs of visited items
426
- **kwargs,
426
+ **kwargs: Any,
427
427
  ) -> SerializationResult:
428
428
  """Serializes the passed item."""
429
429
  my_visited = visited if visited is not None else set()
@@ -454,7 +454,7 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
454
454
  item: NodeItem,
455
455
  doc_serializer: "BaseDocSerializer",
456
456
  doc: DoclingDocument,
457
- **kwargs,
457
+ **kwargs: Any,
458
458
  ) -> SerializationResult:
459
459
  """Serializes the passed item."""
460
460
  return create_ser_result()
@@ -477,7 +477,10 @@ class DocTagsDocSerializer(DocSerializer):
477
477
 
478
478
  @override
479
479
  def serialize_doc(
480
- self, *, parts: list[SerializationResult], **kwargs
480
+ self,
481
+ *,
482
+ parts: list[SerializationResult],
483
+ **kwargs: Any,
481
484
  ) -> SerializationResult:
482
485
  """Serialize a document out of its pages."""
483
486
  delim = _get_delim(params=self.params)
@@ -496,7 +499,7 @@ class DocTagsDocSerializer(DocSerializer):
496
499
  def serialize_captions(
497
500
  self,
498
501
  item: FloatingItem,
499
- **kwargs,
502
+ **kwargs: Any,
500
503
  ) -> SerializationResult:
501
504
  """Serialize the item's captions."""
502
505
  params = DocTagsParams(**kwargs)
@@ -10,7 +10,7 @@ import logging
10
10
  from enum import Enum
11
11
  from io import BytesIO
12
12
  from pathlib import Path
13
- from typing import Optional, Union
13
+ from typing import Any, Optional, Union
14
14
  from urllib.parse import quote
15
15
  from xml.etree.cElementTree import SubElement, tostring
16
16
  from xml.sax.saxutils import unescape
@@ -57,6 +57,7 @@ from docling_core.types.doc.document import (
57
57
  NodeItem,
58
58
  OrderedList,
59
59
  PictureItem,
60
+ PictureTabularChartData,
60
61
  SectionHeaderItem,
61
62
  TableCell,
62
63
  TableItem,
@@ -104,6 +105,9 @@ class HTMLParams(CommonParams):
104
105
  # Allow for different output styles
105
106
  output_style: HTMLOutputStyle = HTMLOutputStyle.SINGLE_COLUMN
106
107
 
108
+ # Enable charts to be printed into HTML as tables
109
+ enable_chart_tables: bool = True
110
+
107
111
 
108
112
  class HTMLTextSerializer(BaseModel, BaseTextSerializer):
109
113
  """HTML-specific text item serializer."""
@@ -116,7 +120,7 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
116
120
  doc_serializer: BaseDocSerializer,
117
121
  doc: DoclingDocument,
118
122
  is_inline_scope: bool = False,
119
- **kwargs,
123
+ **kwargs: Any,
120
124
  ) -> SerializationResult:
121
125
  """Serializes the passed text item to HTML."""
122
126
  params = HTMLParams(**kwargs)
@@ -292,7 +296,7 @@ class HTMLTableSerializer(BaseTableSerializer):
292
296
  item: TableItem,
293
297
  doc_serializer: BaseDocSerializer,
294
298
  doc: DoclingDocument,
295
- **kwargs,
299
+ **kwargs: Any,
296
300
  ) -> SerializationResult:
297
301
  """Serializes the passed table item to HTML."""
298
302
  nrows = item.data.num_rows
@@ -363,7 +367,7 @@ class HTMLPictureSerializer(BasePictureSerializer):
363
367
  item: PictureItem,
364
368
  doc_serializer: BaseDocSerializer,
365
369
  doc: DoclingDocument,
366
- **kwargs,
370
+ **kwargs: Any,
367
371
  ) -> SerializationResult:
368
372
  """Export picture to HTML format."""
369
373
  params = HTMLParams(**kwargs)
@@ -402,9 +406,28 @@ class HTMLPictureSerializer(BasePictureSerializer):
402
406
  and item.image.uri.scheme == "data"
403
407
  ):
404
408
  img_text = f'<img src="{quote(str(item.image.uri))}">'
409
+
405
410
  if img_text:
406
411
  res_parts.append(create_ser_result(text=img_text, span_source=item))
407
412
 
413
+ if params.enable_chart_tables:
414
+ # Check if picture has attached PictureTabularChartData
415
+ tabular_chart_annotations = [
416
+ ann
417
+ for ann in item.annotations
418
+ if isinstance(ann, PictureTabularChartData)
419
+ ]
420
+ if len(tabular_chart_annotations) > 0:
421
+ temp_doc = DoclingDocument(name="temp")
422
+ temp_table = temp_doc.add_table(
423
+ data=tabular_chart_annotations[0].chart_data
424
+ )
425
+ html_table_content = temp_table.export_to_html(temp_doc)
426
+ if len(html_table_content) > 0:
427
+ res_parts.append(
428
+ create_ser_result(text=html_table_content, span_source=item)
429
+ )
430
+
408
431
  text_res = "".join([r.text for r in res_parts])
409
432
  if text_res:
410
433
  text_res = f"<figure>{text_res}</figure>"
@@ -551,7 +574,7 @@ class HTMLKeyValueSerializer(BaseKeyValueSerializer):
551
574
  item: KeyValueItem,
552
575
  doc_serializer: "BaseDocSerializer",
553
576
  doc: DoclingDocument,
554
- **kwargs,
577
+ **kwargs: Any,
555
578
  ) -> SerializationResult:
556
579
  """Serializes the passed key-value item to HTML."""
557
580
  res_parts: list[SerializationResult] = []
@@ -588,7 +611,7 @@ class HTMLFormSerializer(BaseFormSerializer):
588
611
  item: FormItem,
589
612
  doc_serializer: "BaseDocSerializer",
590
613
  doc: DoclingDocument,
591
- **kwargs,
614
+ **kwargs: Any,
592
615
  ) -> SerializationResult:
593
616
  """Serializes the passed form item to HTML."""
594
617
  res_parts: list[SerializationResult] = []
@@ -628,7 +651,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
628
651
  list_level: int = 0,
629
652
  is_inline_scope: bool = False,
630
653
  visited: Optional[set[str]] = None, # refs of visited items
631
- **kwargs,
654
+ **kwargs: Any,
632
655
  ) -> SerializationResult:
633
656
  """Serializes a list to HTML."""
634
657
  my_visited: set[str] = visited if visited is not None else set()
@@ -676,7 +699,7 @@ class HTMLInlineSerializer(BaseInlineSerializer):
676
699
  doc: DoclingDocument,
677
700
  list_level: int = 0,
678
701
  visited: Optional[set[str]] = None, # refs of visited items
679
- **kwargs,
702
+ **kwargs: Any,
680
703
  ) -> SerializationResult:
681
704
  """Serializes an inline group to HTML."""
682
705
  my_visited: set[str] = visited if visited is not None else set()
@@ -710,7 +733,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
710
733
  item: NodeItem,
711
734
  doc_serializer: "BaseDocSerializer",
712
735
  doc: DoclingDocument,
713
- **kwargs,
736
+ **kwargs: Any,
714
737
  ) -> SerializationResult:
715
738
  """Fallback serializer for items not handled by other serializers."""
716
739
  if isinstance(item, DocItem):
@@ -739,35 +762,40 @@ class HTMLDocSerializer(DocSerializer):
739
762
  params: HTMLParams = HTMLParams()
740
763
 
741
764
  @override
742
- def serialize_bold(self, text: str, **kwargs) -> str:
765
+ def serialize_bold(self, text: str, **kwargs: Any) -> str:
743
766
  """Apply HTML-specific bold serialization."""
744
767
  return f"<strong>{text}</strong>"
745
768
 
746
769
  @override
747
- def serialize_italic(self, text: str, **kwargs) -> str:
770
+ def serialize_italic(self, text: str, **kwargs: Any) -> str:
748
771
  """Apply HTML-specific italic serialization."""
749
772
  return f"<em>{text}</em>"
750
773
 
751
774
  @override
752
- def serialize_underline(self, text: str, **kwargs) -> str:
775
+ def serialize_underline(self, text: str, **kwargs: Any) -> str:
753
776
  """Apply HTML-specific underline serialization."""
754
777
  return f"<u>{text}</u>"
755
778
 
756
779
  @override
757
- def serialize_strikethrough(self, text: str, **kwargs) -> str:
780
+ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
758
781
  """Apply HTML-specific strikethrough serialization."""
759
782
  return f"<del>{text}</del>"
760
783
 
761
784
  @override
762
785
  def serialize_hyperlink(
763
- self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
786
+ self,
787
+ text: str,
788
+ hyperlink: Union[AnyUrl, Path],
789
+ **kwargs: Any,
764
790
  ) -> str:
765
791
  """Apply HTML-specific hyperlink serialization."""
766
792
  return f'<a href="{str(hyperlink)}">{text}</a>'
767
793
 
768
794
  @override
769
795
  def serialize_doc(
770
- self, parts: list[SerializationResult], **kwargs
796
+ self,
797
+ parts: list[SerializationResult],
798
+ **kwargs: Any,
771
799
  ) -> SerializationResult:
772
800
  """Serialize a document out of its pages."""
773
801
  # Create HTML structure
@@ -779,6 +807,8 @@ class HTMLDocSerializer(DocSerializer):
779
807
  ]
780
808
 
781
809
  if self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
810
+ applicable_pages = self._get_applicable_pages()
811
+
782
812
  html_content = "\n".join([p.text for p in parts if p.text])
783
813
  next_page: Optional[int] = None
784
814
  prev_full_match_end = 0
@@ -791,11 +821,12 @@ class HTMLDocSerializer(DocSerializer):
791
821
  # capture last page
792
822
  if next_page is not None:
793
823
  pages[next_page] = html_content[prev_full_match_end:]
824
+ elif applicable_pages is not None and len(applicable_pages) == 1:
825
+ pages[applicable_pages[0]] = html_content
794
826
 
795
827
  html_parts.append("<table>")
796
828
  html_parts.append("<tbody>")
797
829
 
798
- applicable_pages = self._get_applicable_pages()
799
830
  for page_no, page in pages.items():
800
831
 
801
832
  if isinstance(page_no, int):
@@ -869,7 +900,7 @@ class HTMLDocSerializer(DocSerializer):
869
900
  self,
870
901
  item: FloatingItem,
871
902
  tag: str = "figcaption",
872
- **kwargs,
903
+ **kwargs: Any,
873
904
  ) -> SerializationResult:
874
905
  """Serialize the item's captions."""
875
906
  params = self.params.merge_with_patch(patch=kwargs)