docling-core 2.27.0__tar.gz → 2.28.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (79)
  1. {docling_core-2.27.0 → docling_core-2.28.1}/PKG-INFO +4 -2
  2. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/base.py +25 -19
  3. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/common.py +17 -11
  4. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/doctags.py +14 -11
  5. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/html.py +21 -16
  6. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/markdown.py +24 -16
  7. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/chunker/hybrid_chunker.py +49 -31
  8. docling_core-2.28.1/docling_core/transforms/chunker/tokenizer/__init__.py +1 -0
  9. docling_core-2.28.1/docling_core/transforms/chunker/tokenizer/base.py +25 -0
  10. docling_core-2.28.1/docling_core/transforms/chunker/tokenizer/huggingface.py +70 -0
  11. docling_core-2.28.1/docling_core/transforms/chunker/tokenizer/openai.py +34 -0
  12. docling_core-2.28.1/docling_core/transforms/visualizer/__init__.py +1 -0
  13. docling_core-2.28.1/docling_core/transforms/visualizer/base.py +23 -0
  14. docling_core-2.28.1/docling_core/transforms/visualizer/layout_visualizer.py +212 -0
  15. docling_core-2.28.1/docling_core/transforms/visualizer/reading_order_visualizer.py +149 -0
  16. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/document.py +25 -3
  17. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/page.py +4 -3
  18. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/document.py +2 -2
  19. {docling_core-2.27.0 → docling_core-2.28.1}/pyproject.toml +9 -2
  20. {docling_core-2.27.0 → docling_core-2.28.1}/LICENSE +0 -0
  21. {docling_core-2.27.0 → docling_core-2.28.1}/README.md +0 -0
  22. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/__init__.py +0 -0
  23. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/cli/__init__.py +0 -0
  24. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/cli/view.py +0 -0
  25. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/__init__.py +0 -0
  26. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/__init__.py +0 -0
  27. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/html_styles.py +0 -0
  28. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/py.typed +0 -0
  29. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  30. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  31. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  32. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  33. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  34. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  35. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  36. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  37. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/__init__.py +0 -0
  38. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  39. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/mapping.py +0 -0
  40. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/meta.py +0 -0
  41. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/package.py +0 -0
  42. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/__init__.py +0 -0
  43. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/chunker/__init__.py +0 -0
  44. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/chunker/base.py +0 -0
  45. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  46. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/__init__.py +0 -0
  47. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/base.py +0 -0
  48. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/__init__.py +0 -0
  49. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/base.py +0 -0
  50. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/labels.py +0 -0
  51. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/tokens.py +0 -0
  52. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/utils.py +0 -0
  53. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/gen/__init__.py +0 -0
  54. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/gen/generic.py +0 -0
  55. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/io/__init__.py +0 -0
  56. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/__init__.py +0 -0
  57. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/base.py +0 -0
  58. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  59. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  60. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  61. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/tokens.py +0 -0
  62. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/nlp/__init__.py +0 -0
  63. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/nlp/qa.py +0 -0
  64. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/nlp/qa_labels.py +0 -0
  65. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/__init__.py +0 -0
  66. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/attribute.py +0 -0
  67. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/base.py +0 -0
  68. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/predicate.py +0 -0
  69. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/record.py +0 -0
  70. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/statement.py +0 -0
  71. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/subject.py +0 -0
  72. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/__init__.py +0 -0
  73. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/alias.py +0 -0
  74. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/file.py +0 -0
  75. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/generate_docs.py +0 -0
  76. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/generate_jsonschema.py +0 -0
  77. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/legacy.py +0 -0
  78. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/validate.py +0 -0
  79. {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.27.0
3
+ Version: 2.28.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -26,6 +26,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Classifier: Typing :: Typed
28
28
  Provides-Extra: chunking
29
+ Provides-Extra: chunking-openai
29
30
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
31
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
32
  Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
@@ -33,8 +34,9 @@ Requires-Dist: pandas (>=2.1.4,<3.0.0)
33
34
  Requires-Dist: pillow (>=10.0.0,<12.0.0)
34
35
  Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
35
36
  Requires-Dist: pyyaml (>=5.1,<7.0.0)
36
- Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
37
+ Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking" or extra == "chunking-openai"
37
38
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
39
+ Requires-Dist: tiktoken (>=0.9.0,<0.10.0) ; extra == "chunking-openai"
38
40
  Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
39
41
  Requires-Dist: typer (>=0.12.5,<0.16.0)
40
42
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
@@ -6,7 +6,7 @@
6
6
  """Define base classes for serialization."""
7
7
  from abc import ABC, abstractmethod
8
8
  from pathlib import Path
9
- from typing import Optional, Union
9
+ from typing import Any, Optional, Union
10
10
 
11
11
  from pydantic import AnyUrl, BaseModel
12
12
 
@@ -51,7 +51,7 @@ class BaseTextSerializer(ABC):
51
51
  item: TextItem,
52
52
  doc_serializer: "BaseDocSerializer",
53
53
  doc: DoclingDocument,
54
- **kwargs,
54
+ **kwargs: Any,
55
55
  ) -> SerializationResult:
56
56
  """Serializes the passed item."""
57
57
  ...
@@ -67,7 +67,7 @@ class BaseTableSerializer(ABC):
67
67
  item: TableItem,
68
68
  doc_serializer: "BaseDocSerializer",
69
69
  doc: DoclingDocument,
70
- **kwargs,
70
+ **kwargs: Any,
71
71
  ) -> SerializationResult:
72
72
  """Serializes the passed item."""
73
73
  ...
@@ -83,7 +83,7 @@ class BasePictureSerializer(ABC):
83
83
  item: PictureItem,
84
84
  doc_serializer: "BaseDocSerializer",
85
85
  doc: DoclingDocument,
86
- **kwargs,
86
+ **kwargs: Any,
87
87
  ) -> SerializationResult:
88
88
  """Serializes the passed item."""
89
89
  ...
@@ -99,7 +99,7 @@ class BaseKeyValueSerializer(ABC):
99
99
  item: KeyValueItem,
100
100
  doc_serializer: "BaseDocSerializer",
101
101
  doc: DoclingDocument,
102
- **kwargs,
102
+ **kwargs: Any,
103
103
  ) -> SerializationResult:
104
104
  """Serializes the passed item."""
105
105
  ...
@@ -115,7 +115,7 @@ class BaseFormSerializer(ABC):
115
115
  item: FormItem,
116
116
  doc_serializer: "BaseDocSerializer",
117
117
  doc: DoclingDocument,
118
- **kwargs,
118
+ **kwargs: Any,
119
119
  ) -> SerializationResult:
120
120
  """Serializes the passed item."""
121
121
  ...
@@ -131,7 +131,7 @@ class BaseListSerializer(ABC):
131
131
  item: Union[UnorderedList, OrderedList],
132
132
  doc_serializer: "BaseDocSerializer",
133
133
  doc: DoclingDocument,
134
- **kwargs,
134
+ **kwargs: Any,
135
135
  ) -> SerializationResult:
136
136
  """Serializes the passed item."""
137
137
  ...
@@ -147,7 +147,7 @@ class BaseInlineSerializer(ABC):
147
147
  item: InlineGroup,
148
148
  doc_serializer: "BaseDocSerializer",
149
149
  doc: DoclingDocument,
150
- **kwargs,
150
+ **kwargs: Any,
151
151
  ) -> SerializationResult:
152
152
  """Serializes the passed item."""
153
153
  ...
@@ -163,7 +163,7 @@ class BaseFallbackSerializer(ABC):
163
163
  item: NodeItem,
164
164
  doc_serializer: "BaseDocSerializer",
165
165
  doc: DoclingDocument,
166
- **kwargs,
166
+ **kwargs: Any,
167
167
  ) -> SerializationResult:
168
168
  """Serializes the passed item."""
169
169
  ...
@@ -174,34 +174,40 @@ class BaseDocSerializer(ABC):
174
174
 
175
175
  @abstractmethod
176
176
  def serialize(
177
- self, *, item: Optional[NodeItem] = None, **kwargs
177
+ self,
178
+ *,
179
+ item: Optional[NodeItem] = None,
180
+ **kwargs: Any,
178
181
  ) -> SerializationResult:
179
182
  """Run the serialization."""
180
183
  ...
181
184
 
182
185
  @abstractmethod
183
- def serialize_bold(self, text: str, **kwargs) -> str:
186
+ def serialize_bold(self, text: str, **kwargs: Any) -> str:
184
187
  """Hook for bold formatting serialization."""
185
188
  ...
186
189
 
187
190
  @abstractmethod
188
- def serialize_italic(self, text: str, **kwargs) -> str:
191
+ def serialize_italic(self, text: str, **kwargs: Any) -> str:
189
192
  """Hook for italic formatting serialization."""
190
193
  ...
191
194
 
192
195
  @abstractmethod
193
- def serialize_underline(self, text: str, **kwargs) -> str:
196
+ def serialize_underline(self, text: str, **kwargs: Any) -> str:
194
197
  """Hook for underline formatting serialization."""
195
198
  ...
196
199
 
197
200
  @abstractmethod
198
- def serialize_strikethrough(self, text: str, **kwargs) -> str:
201
+ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
199
202
  """Hook for strikethrough formatting serialization."""
200
203
  ...
201
204
 
202
205
  @abstractmethod
203
206
  def serialize_hyperlink(
204
- self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
207
+ self,
208
+ text: str,
209
+ hyperlink: Union[AnyUrl, Path],
210
+ **kwargs: Any,
205
211
  ) -> str:
206
212
  """Hook for hyperlink serialization."""
207
213
  ...
@@ -210,7 +216,7 @@ class BaseDocSerializer(ABC):
210
216
  def get_parts(
211
217
  self,
212
218
  item: Optional[NodeItem] = None,
213
- **kwargs,
219
+ **kwargs: Any,
214
220
  ) -> list[SerializationResult]:
215
221
  """Get the components to be combined for serializing this node."""
216
222
  ...
@@ -219,7 +225,7 @@ class BaseDocSerializer(ABC):
219
225
  def post_process(
220
226
  self,
221
227
  text: str,
222
- **kwargs,
228
+ **kwargs: Any,
223
229
  ) -> str:
224
230
  """Apply some text post-processing steps."""
225
231
  ...
@@ -228,13 +234,13 @@ class BaseDocSerializer(ABC):
228
234
  def serialize_captions(
229
235
  self,
230
236
  item: FloatingItem,
231
- **kwargs,
237
+ **kwargs: Any,
232
238
  ) -> SerializationResult:
233
239
  """Serialize the item's captions."""
234
240
  ...
235
241
 
236
242
  @abstractmethod
237
- def get_excluded_refs(self, **kwargs) -> set[str]:
243
+ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
238
244
  """Get references to excluded items."""
239
245
  ...
240
246
 
@@ -214,7 +214,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
214
214
  return refs
215
215
 
216
216
  @override
217
- def get_excluded_refs(self, **kwargs) -> set[str]:
217
+ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
218
218
  """References to excluded items."""
219
219
  params = self.params.merge_with_patch(patch=kwargs)
220
220
  params_json = params.model_dump_json()
@@ -252,7 +252,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
252
252
 
253
253
  @abstractmethod
254
254
  def serialize_doc(
255
- self, *, parts: list[SerializationResult], **kwargs
255
+ self,
256
+ *,
257
+ parts: list[SerializationResult],
258
+ **kwargs: Any,
256
259
  ) -> SerializationResult:
257
260
  """Serialize a document out of its pages."""
258
261
  ...
@@ -271,7 +274,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
271
274
  list_level: int = 0,
272
275
  is_inline_scope: bool = False,
273
276
  visited: Optional[set[str]] = None, # refs of visited items
274
- **kwargs,
277
+ **kwargs: Any,
275
278
  ) -> SerializationResult:
276
279
  """Serialize a given node."""
277
280
  my_visited: set[str] = visited if visited is not None else set()
@@ -380,7 +383,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
380
383
  list_level: int = 0,
381
384
  is_inline_scope: bool = False,
382
385
  visited: Optional[set[str]] = None, # refs of visited items
383
- **kwargs,
386
+ **kwargs: Any,
384
387
  ) -> list[SerializationResult]:
385
388
  """Get the components to be combined for serializing this node."""
386
389
  parts: list[SerializationResult] = []
@@ -415,7 +418,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
415
418
  *,
416
419
  formatting: Optional[Formatting] = None,
417
420
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
418
- **kwargs,
421
+ **kwargs: Any,
419
422
  ) -> str:
420
423
  """Apply some text post-processing steps."""
421
424
  params = self.params.merge_with_patch(patch=kwargs)
@@ -434,28 +437,31 @@ class DocSerializer(BaseModel, BaseDocSerializer):
434
437
  return res
435
438
 
436
439
  @override
437
- def serialize_bold(self, text: str, **kwargs) -> str:
440
+ def serialize_bold(self, text: str, **kwargs: Any) -> str:
438
441
  """Hook for bold formatting serialization."""
439
442
  return text
440
443
 
441
444
  @override
442
- def serialize_italic(self, text: str, **kwargs) -> str:
445
+ def serialize_italic(self, text: str, **kwargs: Any) -> str:
443
446
  """Hook for italic formatting serialization."""
444
447
  return text
445
448
 
446
449
  @override
447
- def serialize_underline(self, text: str, **kwargs) -> str:
450
+ def serialize_underline(self, text: str, **kwargs: Any) -> str:
448
451
  """Hook for underline formatting serialization."""
449
452
  return text
450
453
 
451
454
  @override
452
- def serialize_strikethrough(self, text: str, **kwargs) -> str:
455
+ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
453
456
  """Hook for strikethrough formatting serialization."""
454
457
  return text
455
458
 
456
459
  @override
457
460
  def serialize_hyperlink(
458
- self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
461
+ self,
462
+ text: str,
463
+ hyperlink: Union[AnyUrl, Path],
464
+ **kwargs: Any,
459
465
  ) -> str:
460
466
  """Hook for hyperlink serialization."""
461
467
  return text
@@ -464,7 +470,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
464
470
  def serialize_captions(
465
471
  self,
466
472
  item: FloatingItem,
467
- **kwargs,
473
+ **kwargs: Any,
468
474
  ) -> SerializationResult:
469
475
  """Serialize the item's captions."""
470
476
  params = self.params.merge_with_patch(patch=kwargs)
@@ -1,7 +1,7 @@
1
1
  """Define classes for Doctags serialization."""
2
2
 
3
3
  from enum import Enum
4
- from typing import Dict, List, Optional, Union
4
+ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
  from typing_extensions import override
@@ -91,7 +91,7 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
91
91
  item: TextItem,
92
92
  doc_serializer: BaseDocSerializer,
93
93
  doc: DoclingDocument,
94
- **kwargs,
94
+ **kwargs: Any,
95
95
  ) -> SerializationResult:
96
96
  """Serializes the passed item."""
97
97
  from docling_core.types.doc.document import SectionHeaderItem
@@ -154,7 +154,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
154
154
  item: TableItem,
155
155
  doc_serializer: BaseDocSerializer,
156
156
  doc: DoclingDocument,
157
- **kwargs,
157
+ **kwargs: Any,
158
158
  ) -> SerializationResult:
159
159
  """Serializes the passed item."""
160
160
  params = DocTagsParams(**kwargs)
@@ -201,7 +201,7 @@ class DocTagsPictureSerializer(BasePictureSerializer):
201
201
  item: PictureItem,
202
202
  doc_serializer: BaseDocSerializer,
203
203
  doc: DoclingDocument,
204
- **kwargs,
204
+ **kwargs: Any,
205
205
  ) -> SerializationResult:
206
206
  """Serializes the passed item."""
207
207
  params = DocTagsParams(**kwargs)
@@ -284,7 +284,7 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
284
284
  item: KeyValueItem,
285
285
  doc_serializer: "BaseDocSerializer",
286
286
  doc: DoclingDocument,
287
- **kwargs,
287
+ **kwargs: Any,
288
288
  ) -> SerializationResult:
289
289
  """Serializes the passed item."""
290
290
  params = DocTagsParams(**kwargs)
@@ -356,7 +356,7 @@ class DocTagsFormSerializer(BaseFormSerializer):
356
356
  item: FormItem,
357
357
  doc_serializer: "BaseDocSerializer",
358
358
  doc: DoclingDocument,
359
- **kwargs,
359
+ **kwargs: Any,
360
360
  ) -> SerializationResult:
361
361
  """Serializes the passed item."""
362
362
  # TODO add actual implementation
@@ -378,7 +378,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
378
378
  list_level: int = 0,
379
379
  is_inline_scope: bool = False,
380
380
  visited: Optional[set[str]] = None, # refs of visited items
381
- **kwargs,
381
+ **kwargs: Any,
382
382
  ) -> SerializationResult:
383
383
  """Serializes the passed item."""
384
384
  my_visited = visited if visited is not None else set()
@@ -423,7 +423,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
423
423
  doc: DoclingDocument,
424
424
  list_level: int = 0,
425
425
  visited: Optional[set[str]] = None, # refs of visited items
426
- **kwargs,
426
+ **kwargs: Any,
427
427
  ) -> SerializationResult:
428
428
  """Serializes the passed item."""
429
429
  my_visited = visited if visited is not None else set()
@@ -454,7 +454,7 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
454
454
  item: NodeItem,
455
455
  doc_serializer: "BaseDocSerializer",
456
456
  doc: DoclingDocument,
457
- **kwargs,
457
+ **kwargs: Any,
458
458
  ) -> SerializationResult:
459
459
  """Serializes the passed item."""
460
460
  return create_ser_result()
@@ -477,7 +477,10 @@ class DocTagsDocSerializer(DocSerializer):
477
477
 
478
478
  @override
479
479
  def serialize_doc(
480
- self, *, parts: list[SerializationResult], **kwargs
480
+ self,
481
+ *,
482
+ parts: list[SerializationResult],
483
+ **kwargs: Any,
481
484
  ) -> SerializationResult:
482
485
  """Serialize a document out of its pages."""
483
486
  delim = _get_delim(params=self.params)
@@ -496,7 +499,7 @@ class DocTagsDocSerializer(DocSerializer):
496
499
  def serialize_captions(
497
500
  self,
498
501
  item: FloatingItem,
499
- **kwargs,
502
+ **kwargs: Any,
500
503
  ) -> SerializationResult:
501
504
  """Serialize the item's captions."""
502
505
  params = DocTagsParams(**kwargs)
@@ -10,7 +10,7 @@ import logging
10
10
  from enum import Enum
11
11
  from io import BytesIO
12
12
  from pathlib import Path
13
- from typing import Optional, Union
13
+ from typing import Any, Optional, Union
14
14
  from urllib.parse import quote
15
15
  from xml.etree.cElementTree import SubElement, tostring
16
16
  from xml.sax.saxutils import unescape
@@ -120,7 +120,7 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
120
120
  doc_serializer: BaseDocSerializer,
121
121
  doc: DoclingDocument,
122
122
  is_inline_scope: bool = False,
123
- **kwargs,
123
+ **kwargs: Any,
124
124
  ) -> SerializationResult:
125
125
  """Serializes the passed text item to HTML."""
126
126
  params = HTMLParams(**kwargs)
@@ -296,7 +296,7 @@ class HTMLTableSerializer(BaseTableSerializer):
296
296
  item: TableItem,
297
297
  doc_serializer: BaseDocSerializer,
298
298
  doc: DoclingDocument,
299
- **kwargs,
299
+ **kwargs: Any,
300
300
  ) -> SerializationResult:
301
301
  """Serializes the passed table item to HTML."""
302
302
  nrows = item.data.num_rows
@@ -367,7 +367,7 @@ class HTMLPictureSerializer(BasePictureSerializer):
367
367
  item: PictureItem,
368
368
  doc_serializer: BaseDocSerializer,
369
369
  doc: DoclingDocument,
370
- **kwargs,
370
+ **kwargs: Any,
371
371
  ) -> SerializationResult:
372
372
  """Export picture to HTML format."""
373
373
  params = HTMLParams(**kwargs)
@@ -574,7 +574,7 @@ class HTMLKeyValueSerializer(BaseKeyValueSerializer):
574
574
  item: KeyValueItem,
575
575
  doc_serializer: "BaseDocSerializer",
576
576
  doc: DoclingDocument,
577
- **kwargs,
577
+ **kwargs: Any,
578
578
  ) -> SerializationResult:
579
579
  """Serializes the passed key-value item to HTML."""
580
580
  res_parts: list[SerializationResult] = []
@@ -611,7 +611,7 @@ class HTMLFormSerializer(BaseFormSerializer):
611
611
  item: FormItem,
612
612
  doc_serializer: "BaseDocSerializer",
613
613
  doc: DoclingDocument,
614
- **kwargs,
614
+ **kwargs: Any,
615
615
  ) -> SerializationResult:
616
616
  """Serializes the passed form item to HTML."""
617
617
  res_parts: list[SerializationResult] = []
@@ -651,7 +651,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
651
651
  list_level: int = 0,
652
652
  is_inline_scope: bool = False,
653
653
  visited: Optional[set[str]] = None, # refs of visited items
654
- **kwargs,
654
+ **kwargs: Any,
655
655
  ) -> SerializationResult:
656
656
  """Serializes a list to HTML."""
657
657
  my_visited: set[str] = visited if visited is not None else set()
@@ -699,7 +699,7 @@ class HTMLInlineSerializer(BaseInlineSerializer):
699
699
  doc: DoclingDocument,
700
700
  list_level: int = 0,
701
701
  visited: Optional[set[str]] = None, # refs of visited items
702
- **kwargs,
702
+ **kwargs: Any,
703
703
  ) -> SerializationResult:
704
704
  """Serializes an inline group to HTML."""
705
705
  my_visited: set[str] = visited if visited is not None else set()
@@ -733,7 +733,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
733
733
  item: NodeItem,
734
734
  doc_serializer: "BaseDocSerializer",
735
735
  doc: DoclingDocument,
736
- **kwargs,
736
+ **kwargs: Any,
737
737
  ) -> SerializationResult:
738
738
  """Fallback serializer for items not handled by other serializers."""
739
739
  if isinstance(item, DocItem):
@@ -762,35 +762,40 @@ class HTMLDocSerializer(DocSerializer):
762
762
  params: HTMLParams = HTMLParams()
763
763
 
764
764
  @override
765
- def serialize_bold(self, text: str, **kwargs) -> str:
765
+ def serialize_bold(self, text: str, **kwargs: Any) -> str:
766
766
  """Apply HTML-specific bold serialization."""
767
767
  return f"<strong>{text}</strong>"
768
768
 
769
769
  @override
770
- def serialize_italic(self, text: str, **kwargs) -> str:
770
+ def serialize_italic(self, text: str, **kwargs: Any) -> str:
771
771
  """Apply HTML-specific italic serialization."""
772
772
  return f"<em>{text}</em>"
773
773
 
774
774
  @override
775
- def serialize_underline(self, text: str, **kwargs) -> str:
775
+ def serialize_underline(self, text: str, **kwargs: Any) -> str:
776
776
  """Apply HTML-specific underline serialization."""
777
777
  return f"<u>{text}</u>"
778
778
 
779
779
  @override
780
- def serialize_strikethrough(self, text: str, **kwargs) -> str:
780
+ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
781
781
  """Apply HTML-specific strikethrough serialization."""
782
782
  return f"<del>{text}</del>"
783
783
 
784
784
  @override
785
785
  def serialize_hyperlink(
786
- self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
786
+ self,
787
+ text: str,
788
+ hyperlink: Union[AnyUrl, Path],
789
+ **kwargs: Any,
787
790
  ) -> str:
788
791
  """Apply HTML-specific hyperlink serialization."""
789
792
  return f'<a href="{str(hyperlink)}">{text}</a>'
790
793
 
791
794
  @override
792
795
  def serialize_doc(
793
- self, parts: list[SerializationResult], **kwargs
796
+ self,
797
+ parts: list[SerializationResult],
798
+ **kwargs: Any,
794
799
  ) -> SerializationResult:
795
800
  """Serialize a document out of its pages."""
796
801
  # Create HTML structure
@@ -895,7 +900,7 @@ class HTMLDocSerializer(DocSerializer):
895
900
  self,
896
901
  item: FloatingItem,
897
902
  tag: str = "figcaption",
898
- **kwargs,
903
+ **kwargs: Any,
899
904
  ) -> SerializationResult:
900
905
  """Serialize the item's captions."""
901
906
  params = self.params.merge_with_patch(patch=kwargs)
@@ -8,7 +8,7 @@ import html
8
8
  import re
9
9
  import textwrap
10
10
  from pathlib import Path
11
- from typing import Optional, Union
11
+ from typing import Any, Optional, Union
12
12
 
13
13
  from pydantic import AnyUrl, BaseModel, PositiveInt
14
14
  from tabulate import tabulate
@@ -82,7 +82,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
82
82
  doc_serializer: BaseDocSerializer,
83
83
  doc: DoclingDocument,
84
84
  is_inline_scope: bool = False,
85
- **kwargs,
85
+ **kwargs: Any,
86
86
  ) -> SerializationResult:
87
87
  """Serializes the passed item."""
88
88
  params = MarkdownParams(**kwargs)
@@ -143,7 +143,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
143
143
  item: TableItem,
144
144
  doc_serializer: BaseDocSerializer,
145
145
  doc: DoclingDocument,
146
- **kwargs,
146
+ **kwargs: Any,
147
147
  ) -> SerializationResult:
148
148
  """Serializes the passed item."""
149
149
  res_parts: list[SerializationResult] = []
@@ -195,7 +195,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
195
195
  item: PictureItem,
196
196
  doc_serializer: BaseDocSerializer,
197
197
  doc: DoclingDocument,
198
- **kwargs,
198
+ **kwargs: Any,
199
199
  ) -> SerializationResult:
200
200
  """Serializes the passed item."""
201
201
  params = MarkdownParams(**kwargs)
@@ -246,7 +246,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
246
246
  doc: DoclingDocument,
247
247
  image_mode: ImageRefMode,
248
248
  image_placeholder: str,
249
- **kwargs,
249
+ **kwargs: Any,
250
250
  ) -> SerializationResult:
251
251
  error_response = (
252
252
  "<!-- 🖼️❌ Image not available. "
@@ -298,7 +298,7 @@ class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
298
298
  item: KeyValueItem,
299
299
  doc_serializer: "BaseDocSerializer",
300
300
  doc: DoclingDocument,
301
- **kwargs,
301
+ **kwargs: Any,
302
302
  ) -> SerializationResult:
303
303
  """Serializes the passed item."""
304
304
  # TODO add actual implementation
@@ -321,7 +321,7 @@ class MarkdownFormSerializer(BaseFormSerializer):
321
321
  item: FormItem,
322
322
  doc_serializer: "BaseDocSerializer",
323
323
  doc: DoclingDocument,
324
- **kwargs,
324
+ **kwargs: Any,
325
325
  ) -> SerializationResult:
326
326
  """Serializes the passed item."""
327
327
  # TODO add actual implementation
@@ -347,7 +347,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
347
347
  list_level: int = 0,
348
348
  is_inline_scope: bool = False,
349
349
  visited: Optional[set[str]] = None, # refs of visited items
350
- **kwargs,
350
+ **kwargs: Any,
351
351
  ) -> SerializationResult:
352
352
  """Serializes the passed item."""
353
353
  params = MarkdownParams(**kwargs)
@@ -400,7 +400,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
400
400
  doc: DoclingDocument,
401
401
  list_level: int = 0,
402
402
  visited: Optional[set[str]] = None, # refs of visited items
403
- **kwargs,
403
+ **kwargs: Any,
404
404
  ) -> SerializationResult:
405
405
  """Serializes the passed item."""
406
406
  my_visited = visited if visited is not None else set()
@@ -425,7 +425,7 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
425
425
  item: NodeItem,
426
426
  doc_serializer: "BaseDocSerializer",
427
427
  doc: DoclingDocument,
428
- **kwargs,
428
+ **kwargs: Any,
429
429
  ) -> SerializationResult:
430
430
  """Serializes the passed item."""
431
431
  if isinstance(item, DocItem):
@@ -453,22 +453,27 @@ class MarkdownDocSerializer(DocSerializer):
453
453
  params: MarkdownParams = MarkdownParams()
454
454
 
455
455
  @override
456
- def serialize_bold(self, text: str, **kwargs):
456
+ def serialize_bold(self, text: str, **kwargs: Any):
457
457
  """Apply Markdown-specific bold serialization."""
458
458
  return f"**{text}**"
459
459
 
460
460
  @override
461
- def serialize_italic(self, text: str, **kwargs):
461
+ def serialize_italic(self, text: str, **kwargs: Any):
462
462
  """Apply Markdown-specific italic serialization."""
463
463
  return f"*{text}*"
464
464
 
465
465
  @override
466
- def serialize_strikethrough(self, text: str, **kwargs):
466
+ def serialize_strikethrough(self, text: str, **kwargs: Any):
467
467
  """Apply Markdown-specific strikethrough serialization."""
468
468
  return f"~~{text}~~"
469
469
 
470
470
  @override
471
- def serialize_hyperlink(self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs):
471
+ def serialize_hyperlink(
472
+ self,
473
+ text: str,
474
+ hyperlink: Union[AnyUrl, Path],
475
+ **kwargs: Any,
476
+ ):
472
477
  """Apply Markdown-specific hyperlink serialization."""
473
478
  return f"[{text}]({str(hyperlink)})"
474
479
 
@@ -505,7 +510,7 @@ class MarkdownDocSerializer(DocSerializer):
505
510
  escape_underscores: bool = True,
506
511
  formatting: Optional[Formatting] = None,
507
512
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
508
- **kwargs,
513
+ **kwargs: Any,
509
514
  ) -> str:
510
515
  """Apply some text post-processing steps."""
511
516
  res = text
@@ -523,7 +528,10 @@ class MarkdownDocSerializer(DocSerializer):
523
528
 
524
529
  @override
525
530
  def serialize_doc(
526
- self, *, parts: list[SerializationResult], **kwargs
531
+ self,
532
+ *,
533
+ parts: list[SerializationResult],
534
+ **kwargs: Any,
527
535
  ) -> SerializationResult:
528
536
  """Serialize a document out of its parts."""
529
537
  text_res = "\n\n".join([p.text for p in parts if p.text])