docling-core 2.27.0__tar.gz → 2.28.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.27.0 → docling_core-2.28.1}/PKG-INFO +4 -2
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/base.py +25 -19
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/common.py +17 -11
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/doctags.py +14 -11
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/html.py +21 -16
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/markdown.py +24 -16
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/chunker/hybrid_chunker.py +49 -31
- docling_core-2.28.1/docling_core/transforms/chunker/tokenizer/__init__.py +1 -0
- docling_core-2.28.1/docling_core/transforms/chunker/tokenizer/base.py +25 -0
- docling_core-2.28.1/docling_core/transforms/chunker/tokenizer/huggingface.py +70 -0
- docling_core-2.28.1/docling_core/transforms/chunker/tokenizer/openai.py +34 -0
- docling_core-2.28.1/docling_core/transforms/visualizer/__init__.py +1 -0
- docling_core-2.28.1/docling_core/transforms/visualizer/base.py +23 -0
- docling_core-2.28.1/docling_core/transforms/visualizer/layout_visualizer.py +212 -0
- docling_core-2.28.1/docling_core/transforms/visualizer/reading_order_visualizer.py +149 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/document.py +25 -3
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/page.py +4 -3
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/document.py +2 -2
- {docling_core-2.27.0 → docling_core-2.28.1}/pyproject.toml +9 -2
- {docling_core-2.27.0 → docling_core-2.28.1}/LICENSE +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/README.md +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/html_styles.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/py.typed +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/search/package.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/base.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.27.0 → docling_core-2.28.1}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.27.0
|
|
3
|
+
Version: 2.28.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://github.com/docling-project
|
|
6
6
|
License: MIT
|
|
@@ -26,6 +26,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
27
|
Classifier: Typing :: Typed
|
|
28
28
|
Provides-Extra: chunking
|
|
29
|
+
Provides-Extra: chunking-openai
|
|
29
30
|
Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
30
31
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
32
|
Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
|
|
@@ -33,8 +34,9 @@ Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
|
33
34
|
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
|
34
35
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
35
36
|
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
36
|
-
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
|
|
37
|
+
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking" or extra == "chunking-openai"
|
|
37
38
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
39
|
+
Requires-Dist: tiktoken (>=0.9.0,<0.10.0) ; extra == "chunking-openai"
|
|
38
40
|
Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
|
|
39
41
|
Requires-Dist: typer (>=0.12.5,<0.16.0)
|
|
40
42
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"""Define base classes for serialization."""
|
|
7
7
|
from abc import ABC, abstractmethod
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Optional, Union
|
|
9
|
+
from typing import Any, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import AnyUrl, BaseModel
|
|
12
12
|
|
|
@@ -51,7 +51,7 @@ class BaseTextSerializer(ABC):
|
|
|
51
51
|
item: TextItem,
|
|
52
52
|
doc_serializer: "BaseDocSerializer",
|
|
53
53
|
doc: DoclingDocument,
|
|
54
|
-
**kwargs,
|
|
54
|
+
**kwargs: Any,
|
|
55
55
|
) -> SerializationResult:
|
|
56
56
|
"""Serializes the passed item."""
|
|
57
57
|
...
|
|
@@ -67,7 +67,7 @@ class BaseTableSerializer(ABC):
|
|
|
67
67
|
item: TableItem,
|
|
68
68
|
doc_serializer: "BaseDocSerializer",
|
|
69
69
|
doc: DoclingDocument,
|
|
70
|
-
**kwargs,
|
|
70
|
+
**kwargs: Any,
|
|
71
71
|
) -> SerializationResult:
|
|
72
72
|
"""Serializes the passed item."""
|
|
73
73
|
...
|
|
@@ -83,7 +83,7 @@ class BasePictureSerializer(ABC):
|
|
|
83
83
|
item: PictureItem,
|
|
84
84
|
doc_serializer: "BaseDocSerializer",
|
|
85
85
|
doc: DoclingDocument,
|
|
86
|
-
**kwargs,
|
|
86
|
+
**kwargs: Any,
|
|
87
87
|
) -> SerializationResult:
|
|
88
88
|
"""Serializes the passed item."""
|
|
89
89
|
...
|
|
@@ -99,7 +99,7 @@ class BaseKeyValueSerializer(ABC):
|
|
|
99
99
|
item: KeyValueItem,
|
|
100
100
|
doc_serializer: "BaseDocSerializer",
|
|
101
101
|
doc: DoclingDocument,
|
|
102
|
-
**kwargs,
|
|
102
|
+
**kwargs: Any,
|
|
103
103
|
) -> SerializationResult:
|
|
104
104
|
"""Serializes the passed item."""
|
|
105
105
|
...
|
|
@@ -115,7 +115,7 @@ class BaseFormSerializer(ABC):
|
|
|
115
115
|
item: FormItem,
|
|
116
116
|
doc_serializer: "BaseDocSerializer",
|
|
117
117
|
doc: DoclingDocument,
|
|
118
|
-
**kwargs,
|
|
118
|
+
**kwargs: Any,
|
|
119
119
|
) -> SerializationResult:
|
|
120
120
|
"""Serializes the passed item."""
|
|
121
121
|
...
|
|
@@ -131,7 +131,7 @@ class BaseListSerializer(ABC):
|
|
|
131
131
|
item: Union[UnorderedList, OrderedList],
|
|
132
132
|
doc_serializer: "BaseDocSerializer",
|
|
133
133
|
doc: DoclingDocument,
|
|
134
|
-
**kwargs,
|
|
134
|
+
**kwargs: Any,
|
|
135
135
|
) -> SerializationResult:
|
|
136
136
|
"""Serializes the passed item."""
|
|
137
137
|
...
|
|
@@ -147,7 +147,7 @@ class BaseInlineSerializer(ABC):
|
|
|
147
147
|
item: InlineGroup,
|
|
148
148
|
doc_serializer: "BaseDocSerializer",
|
|
149
149
|
doc: DoclingDocument,
|
|
150
|
-
**kwargs,
|
|
150
|
+
**kwargs: Any,
|
|
151
151
|
) -> SerializationResult:
|
|
152
152
|
"""Serializes the passed item."""
|
|
153
153
|
...
|
|
@@ -163,7 +163,7 @@ class BaseFallbackSerializer(ABC):
|
|
|
163
163
|
item: NodeItem,
|
|
164
164
|
doc_serializer: "BaseDocSerializer",
|
|
165
165
|
doc: DoclingDocument,
|
|
166
|
-
**kwargs,
|
|
166
|
+
**kwargs: Any,
|
|
167
167
|
) -> SerializationResult:
|
|
168
168
|
"""Serializes the passed item."""
|
|
169
169
|
...
|
|
@@ -174,34 +174,40 @@ class BaseDocSerializer(ABC):
|
|
|
174
174
|
|
|
175
175
|
@abstractmethod
|
|
176
176
|
def serialize(
|
|
177
|
-
self,
|
|
177
|
+
self,
|
|
178
|
+
*,
|
|
179
|
+
item: Optional[NodeItem] = None,
|
|
180
|
+
**kwargs: Any,
|
|
178
181
|
) -> SerializationResult:
|
|
179
182
|
"""Run the serialization."""
|
|
180
183
|
...
|
|
181
184
|
|
|
182
185
|
@abstractmethod
|
|
183
|
-
def serialize_bold(self, text: str, **kwargs) -> str:
|
|
186
|
+
def serialize_bold(self, text: str, **kwargs: Any) -> str:
|
|
184
187
|
"""Hook for bold formatting serialization."""
|
|
185
188
|
...
|
|
186
189
|
|
|
187
190
|
@abstractmethod
|
|
188
|
-
def serialize_italic(self, text: str, **kwargs) -> str:
|
|
191
|
+
def serialize_italic(self, text: str, **kwargs: Any) -> str:
|
|
189
192
|
"""Hook for italic formatting serialization."""
|
|
190
193
|
...
|
|
191
194
|
|
|
192
195
|
@abstractmethod
|
|
193
|
-
def serialize_underline(self, text: str, **kwargs) -> str:
|
|
196
|
+
def serialize_underline(self, text: str, **kwargs: Any) -> str:
|
|
194
197
|
"""Hook for underline formatting serialization."""
|
|
195
198
|
...
|
|
196
199
|
|
|
197
200
|
@abstractmethod
|
|
198
|
-
def serialize_strikethrough(self, text: str, **kwargs) -> str:
|
|
201
|
+
def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
|
|
199
202
|
"""Hook for strikethrough formatting serialization."""
|
|
200
203
|
...
|
|
201
204
|
|
|
202
205
|
@abstractmethod
|
|
203
206
|
def serialize_hyperlink(
|
|
204
|
-
self,
|
|
207
|
+
self,
|
|
208
|
+
text: str,
|
|
209
|
+
hyperlink: Union[AnyUrl, Path],
|
|
210
|
+
**kwargs: Any,
|
|
205
211
|
) -> str:
|
|
206
212
|
"""Hook for hyperlink serialization."""
|
|
207
213
|
...
|
|
@@ -210,7 +216,7 @@ class BaseDocSerializer(ABC):
|
|
|
210
216
|
def get_parts(
|
|
211
217
|
self,
|
|
212
218
|
item: Optional[NodeItem] = None,
|
|
213
|
-
**kwargs,
|
|
219
|
+
**kwargs: Any,
|
|
214
220
|
) -> list[SerializationResult]:
|
|
215
221
|
"""Get the components to be combined for serializing this node."""
|
|
216
222
|
...
|
|
@@ -219,7 +225,7 @@ class BaseDocSerializer(ABC):
|
|
|
219
225
|
def post_process(
|
|
220
226
|
self,
|
|
221
227
|
text: str,
|
|
222
|
-
**kwargs,
|
|
228
|
+
**kwargs: Any,
|
|
223
229
|
) -> str:
|
|
224
230
|
"""Apply some text post-processing steps."""
|
|
225
231
|
...
|
|
@@ -228,13 +234,13 @@ class BaseDocSerializer(ABC):
|
|
|
228
234
|
def serialize_captions(
|
|
229
235
|
self,
|
|
230
236
|
item: FloatingItem,
|
|
231
|
-
**kwargs,
|
|
237
|
+
**kwargs: Any,
|
|
232
238
|
) -> SerializationResult:
|
|
233
239
|
"""Serialize the item's captions."""
|
|
234
240
|
...
|
|
235
241
|
|
|
236
242
|
@abstractmethod
|
|
237
|
-
def get_excluded_refs(self, **kwargs) -> set[str]:
|
|
243
|
+
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
|
|
238
244
|
"""Get references to excluded items."""
|
|
239
245
|
...
|
|
240
246
|
|
|
@@ -214,7 +214,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
214
214
|
return refs
|
|
215
215
|
|
|
216
216
|
@override
|
|
217
|
-
def get_excluded_refs(self, **kwargs) -> set[str]:
|
|
217
|
+
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
|
|
218
218
|
"""References to excluded items."""
|
|
219
219
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
220
220
|
params_json = params.model_dump_json()
|
|
@@ -252,7 +252,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
252
252
|
|
|
253
253
|
@abstractmethod
|
|
254
254
|
def serialize_doc(
|
|
255
|
-
self,
|
|
255
|
+
self,
|
|
256
|
+
*,
|
|
257
|
+
parts: list[SerializationResult],
|
|
258
|
+
**kwargs: Any,
|
|
256
259
|
) -> SerializationResult:
|
|
257
260
|
"""Serialize a document out of its pages."""
|
|
258
261
|
...
|
|
@@ -271,7 +274,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
271
274
|
list_level: int = 0,
|
|
272
275
|
is_inline_scope: bool = False,
|
|
273
276
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
274
|
-
**kwargs,
|
|
277
|
+
**kwargs: Any,
|
|
275
278
|
) -> SerializationResult:
|
|
276
279
|
"""Serialize a given node."""
|
|
277
280
|
my_visited: set[str] = visited if visited is not None else set()
|
|
@@ -380,7 +383,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
380
383
|
list_level: int = 0,
|
|
381
384
|
is_inline_scope: bool = False,
|
|
382
385
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
383
|
-
**kwargs,
|
|
386
|
+
**kwargs: Any,
|
|
384
387
|
) -> list[SerializationResult]:
|
|
385
388
|
"""Get the components to be combined for serializing this node."""
|
|
386
389
|
parts: list[SerializationResult] = []
|
|
@@ -415,7 +418,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
415
418
|
*,
|
|
416
419
|
formatting: Optional[Formatting] = None,
|
|
417
420
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
418
|
-
**kwargs,
|
|
421
|
+
**kwargs: Any,
|
|
419
422
|
) -> str:
|
|
420
423
|
"""Apply some text post-processing steps."""
|
|
421
424
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
@@ -434,28 +437,31 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
434
437
|
return res
|
|
435
438
|
|
|
436
439
|
@override
|
|
437
|
-
def serialize_bold(self, text: str, **kwargs) -> str:
|
|
440
|
+
def serialize_bold(self, text: str, **kwargs: Any) -> str:
|
|
438
441
|
"""Hook for bold formatting serialization."""
|
|
439
442
|
return text
|
|
440
443
|
|
|
441
444
|
@override
|
|
442
|
-
def serialize_italic(self, text: str, **kwargs) -> str:
|
|
445
|
+
def serialize_italic(self, text: str, **kwargs: Any) -> str:
|
|
443
446
|
"""Hook for italic formatting serialization."""
|
|
444
447
|
return text
|
|
445
448
|
|
|
446
449
|
@override
|
|
447
|
-
def serialize_underline(self, text: str, **kwargs) -> str:
|
|
450
|
+
def serialize_underline(self, text: str, **kwargs: Any) -> str:
|
|
448
451
|
"""Hook for underline formatting serialization."""
|
|
449
452
|
return text
|
|
450
453
|
|
|
451
454
|
@override
|
|
452
|
-
def serialize_strikethrough(self, text: str, **kwargs) -> str:
|
|
455
|
+
def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
|
|
453
456
|
"""Hook for strikethrough formatting serialization."""
|
|
454
457
|
return text
|
|
455
458
|
|
|
456
459
|
@override
|
|
457
460
|
def serialize_hyperlink(
|
|
458
|
-
self,
|
|
461
|
+
self,
|
|
462
|
+
text: str,
|
|
463
|
+
hyperlink: Union[AnyUrl, Path],
|
|
464
|
+
**kwargs: Any,
|
|
459
465
|
) -> str:
|
|
460
466
|
"""Hook for hyperlink serialization."""
|
|
461
467
|
return text
|
|
@@ -464,7 +470,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
464
470
|
def serialize_captions(
|
|
465
471
|
self,
|
|
466
472
|
item: FloatingItem,
|
|
467
|
-
**kwargs,
|
|
473
|
+
**kwargs: Any,
|
|
468
474
|
) -> SerializationResult:
|
|
469
475
|
"""Serialize the item's captions."""
|
|
470
476
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Define classes for Doctags serialization."""
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Dict, List, Optional, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
from typing_extensions import override
|
|
@@ -91,7 +91,7 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
91
91
|
item: TextItem,
|
|
92
92
|
doc_serializer: BaseDocSerializer,
|
|
93
93
|
doc: DoclingDocument,
|
|
94
|
-
**kwargs,
|
|
94
|
+
**kwargs: Any,
|
|
95
95
|
) -> SerializationResult:
|
|
96
96
|
"""Serializes the passed item."""
|
|
97
97
|
from docling_core.types.doc.document import SectionHeaderItem
|
|
@@ -154,7 +154,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
154
154
|
item: TableItem,
|
|
155
155
|
doc_serializer: BaseDocSerializer,
|
|
156
156
|
doc: DoclingDocument,
|
|
157
|
-
**kwargs,
|
|
157
|
+
**kwargs: Any,
|
|
158
158
|
) -> SerializationResult:
|
|
159
159
|
"""Serializes the passed item."""
|
|
160
160
|
params = DocTagsParams(**kwargs)
|
|
@@ -201,7 +201,7 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
201
201
|
item: PictureItem,
|
|
202
202
|
doc_serializer: BaseDocSerializer,
|
|
203
203
|
doc: DoclingDocument,
|
|
204
|
-
**kwargs,
|
|
204
|
+
**kwargs: Any,
|
|
205
205
|
) -> SerializationResult:
|
|
206
206
|
"""Serializes the passed item."""
|
|
207
207
|
params = DocTagsParams(**kwargs)
|
|
@@ -284,7 +284,7 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
284
284
|
item: KeyValueItem,
|
|
285
285
|
doc_serializer: "BaseDocSerializer",
|
|
286
286
|
doc: DoclingDocument,
|
|
287
|
-
**kwargs,
|
|
287
|
+
**kwargs: Any,
|
|
288
288
|
) -> SerializationResult:
|
|
289
289
|
"""Serializes the passed item."""
|
|
290
290
|
params = DocTagsParams(**kwargs)
|
|
@@ -356,7 +356,7 @@ class DocTagsFormSerializer(BaseFormSerializer):
|
|
|
356
356
|
item: FormItem,
|
|
357
357
|
doc_serializer: "BaseDocSerializer",
|
|
358
358
|
doc: DoclingDocument,
|
|
359
|
-
**kwargs,
|
|
359
|
+
**kwargs: Any,
|
|
360
360
|
) -> SerializationResult:
|
|
361
361
|
"""Serializes the passed item."""
|
|
362
362
|
# TODO add actual implementation
|
|
@@ -378,7 +378,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
378
378
|
list_level: int = 0,
|
|
379
379
|
is_inline_scope: bool = False,
|
|
380
380
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
381
|
-
**kwargs,
|
|
381
|
+
**kwargs: Any,
|
|
382
382
|
) -> SerializationResult:
|
|
383
383
|
"""Serializes the passed item."""
|
|
384
384
|
my_visited = visited if visited is not None else set()
|
|
@@ -423,7 +423,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
423
423
|
doc: DoclingDocument,
|
|
424
424
|
list_level: int = 0,
|
|
425
425
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
426
|
-
**kwargs,
|
|
426
|
+
**kwargs: Any,
|
|
427
427
|
) -> SerializationResult:
|
|
428
428
|
"""Serializes the passed item."""
|
|
429
429
|
my_visited = visited if visited is not None else set()
|
|
@@ -454,7 +454,7 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
454
454
|
item: NodeItem,
|
|
455
455
|
doc_serializer: "BaseDocSerializer",
|
|
456
456
|
doc: DoclingDocument,
|
|
457
|
-
**kwargs,
|
|
457
|
+
**kwargs: Any,
|
|
458
458
|
) -> SerializationResult:
|
|
459
459
|
"""Serializes the passed item."""
|
|
460
460
|
return create_ser_result()
|
|
@@ -477,7 +477,10 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
477
477
|
|
|
478
478
|
@override
|
|
479
479
|
def serialize_doc(
|
|
480
|
-
self,
|
|
480
|
+
self,
|
|
481
|
+
*,
|
|
482
|
+
parts: list[SerializationResult],
|
|
483
|
+
**kwargs: Any,
|
|
481
484
|
) -> SerializationResult:
|
|
482
485
|
"""Serialize a document out of its pages."""
|
|
483
486
|
delim = _get_delim(params=self.params)
|
|
@@ -496,7 +499,7 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
496
499
|
def serialize_captions(
|
|
497
500
|
self,
|
|
498
501
|
item: FloatingItem,
|
|
499
|
-
**kwargs,
|
|
502
|
+
**kwargs: Any,
|
|
500
503
|
) -> SerializationResult:
|
|
501
504
|
"""Serialize the item's captions."""
|
|
502
505
|
params = DocTagsParams(**kwargs)
|
|
@@ -10,7 +10,7 @@ import logging
|
|
|
10
10
|
from enum import Enum
|
|
11
11
|
from io import BytesIO
|
|
12
12
|
from pathlib import Path
|
|
13
|
-
from typing import Optional, Union
|
|
13
|
+
from typing import Any, Optional, Union
|
|
14
14
|
from urllib.parse import quote
|
|
15
15
|
from xml.etree.cElementTree import SubElement, tostring
|
|
16
16
|
from xml.sax.saxutils import unescape
|
|
@@ -120,7 +120,7 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
120
120
|
doc_serializer: BaseDocSerializer,
|
|
121
121
|
doc: DoclingDocument,
|
|
122
122
|
is_inline_scope: bool = False,
|
|
123
|
-
**kwargs,
|
|
123
|
+
**kwargs: Any,
|
|
124
124
|
) -> SerializationResult:
|
|
125
125
|
"""Serializes the passed text item to HTML."""
|
|
126
126
|
params = HTMLParams(**kwargs)
|
|
@@ -296,7 +296,7 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
296
296
|
item: TableItem,
|
|
297
297
|
doc_serializer: BaseDocSerializer,
|
|
298
298
|
doc: DoclingDocument,
|
|
299
|
-
**kwargs,
|
|
299
|
+
**kwargs: Any,
|
|
300
300
|
) -> SerializationResult:
|
|
301
301
|
"""Serializes the passed table item to HTML."""
|
|
302
302
|
nrows = item.data.num_rows
|
|
@@ -367,7 +367,7 @@ class HTMLPictureSerializer(BasePictureSerializer):
|
|
|
367
367
|
item: PictureItem,
|
|
368
368
|
doc_serializer: BaseDocSerializer,
|
|
369
369
|
doc: DoclingDocument,
|
|
370
|
-
**kwargs,
|
|
370
|
+
**kwargs: Any,
|
|
371
371
|
) -> SerializationResult:
|
|
372
372
|
"""Export picture to HTML format."""
|
|
373
373
|
params = HTMLParams(**kwargs)
|
|
@@ -574,7 +574,7 @@ class HTMLKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
574
574
|
item: KeyValueItem,
|
|
575
575
|
doc_serializer: "BaseDocSerializer",
|
|
576
576
|
doc: DoclingDocument,
|
|
577
|
-
**kwargs,
|
|
577
|
+
**kwargs: Any,
|
|
578
578
|
) -> SerializationResult:
|
|
579
579
|
"""Serializes the passed key-value item to HTML."""
|
|
580
580
|
res_parts: list[SerializationResult] = []
|
|
@@ -611,7 +611,7 @@ class HTMLFormSerializer(BaseFormSerializer):
|
|
|
611
611
|
item: FormItem,
|
|
612
612
|
doc_serializer: "BaseDocSerializer",
|
|
613
613
|
doc: DoclingDocument,
|
|
614
|
-
**kwargs,
|
|
614
|
+
**kwargs: Any,
|
|
615
615
|
) -> SerializationResult:
|
|
616
616
|
"""Serializes the passed form item to HTML."""
|
|
617
617
|
res_parts: list[SerializationResult] = []
|
|
@@ -651,7 +651,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
651
651
|
list_level: int = 0,
|
|
652
652
|
is_inline_scope: bool = False,
|
|
653
653
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
654
|
-
**kwargs,
|
|
654
|
+
**kwargs: Any,
|
|
655
655
|
) -> SerializationResult:
|
|
656
656
|
"""Serializes a list to HTML."""
|
|
657
657
|
my_visited: set[str] = visited if visited is not None else set()
|
|
@@ -699,7 +699,7 @@ class HTMLInlineSerializer(BaseInlineSerializer):
|
|
|
699
699
|
doc: DoclingDocument,
|
|
700
700
|
list_level: int = 0,
|
|
701
701
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
702
|
-
**kwargs,
|
|
702
|
+
**kwargs: Any,
|
|
703
703
|
) -> SerializationResult:
|
|
704
704
|
"""Serializes an inline group to HTML."""
|
|
705
705
|
my_visited: set[str] = visited if visited is not None else set()
|
|
@@ -733,7 +733,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
733
733
|
item: NodeItem,
|
|
734
734
|
doc_serializer: "BaseDocSerializer",
|
|
735
735
|
doc: DoclingDocument,
|
|
736
|
-
**kwargs,
|
|
736
|
+
**kwargs: Any,
|
|
737
737
|
) -> SerializationResult:
|
|
738
738
|
"""Fallback serializer for items not handled by other serializers."""
|
|
739
739
|
if isinstance(item, DocItem):
|
|
@@ -762,35 +762,40 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
762
762
|
params: HTMLParams = HTMLParams()
|
|
763
763
|
|
|
764
764
|
@override
|
|
765
|
-
def serialize_bold(self, text: str, **kwargs) -> str:
|
|
765
|
+
def serialize_bold(self, text: str, **kwargs: Any) -> str:
|
|
766
766
|
"""Apply HTML-specific bold serialization."""
|
|
767
767
|
return f"<strong>{text}</strong>"
|
|
768
768
|
|
|
769
769
|
@override
|
|
770
|
-
def serialize_italic(self, text: str, **kwargs) -> str:
|
|
770
|
+
def serialize_italic(self, text: str, **kwargs: Any) -> str:
|
|
771
771
|
"""Apply HTML-specific italic serialization."""
|
|
772
772
|
return f"<em>{text}</em>"
|
|
773
773
|
|
|
774
774
|
@override
|
|
775
|
-
def serialize_underline(self, text: str, **kwargs) -> str:
|
|
775
|
+
def serialize_underline(self, text: str, **kwargs: Any) -> str:
|
|
776
776
|
"""Apply HTML-specific underline serialization."""
|
|
777
777
|
return f"<u>{text}</u>"
|
|
778
778
|
|
|
779
779
|
@override
|
|
780
|
-
def serialize_strikethrough(self, text: str, **kwargs) -> str:
|
|
780
|
+
def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
|
|
781
781
|
"""Apply HTML-specific strikethrough serialization."""
|
|
782
782
|
return f"<del>{text}</del>"
|
|
783
783
|
|
|
784
784
|
@override
|
|
785
785
|
def serialize_hyperlink(
|
|
786
|
-
self,
|
|
786
|
+
self,
|
|
787
|
+
text: str,
|
|
788
|
+
hyperlink: Union[AnyUrl, Path],
|
|
789
|
+
**kwargs: Any,
|
|
787
790
|
) -> str:
|
|
788
791
|
"""Apply HTML-specific hyperlink serialization."""
|
|
789
792
|
return f'<a href="{str(hyperlink)}">{text}</a>'
|
|
790
793
|
|
|
791
794
|
@override
|
|
792
795
|
def serialize_doc(
|
|
793
|
-
self,
|
|
796
|
+
self,
|
|
797
|
+
parts: list[SerializationResult],
|
|
798
|
+
**kwargs: Any,
|
|
794
799
|
) -> SerializationResult:
|
|
795
800
|
"""Serialize a document out of its pages."""
|
|
796
801
|
# Create HTML structure
|
|
@@ -895,7 +900,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
895
900
|
self,
|
|
896
901
|
item: FloatingItem,
|
|
897
902
|
tag: str = "figcaption",
|
|
898
|
-
**kwargs,
|
|
903
|
+
**kwargs: Any,
|
|
899
904
|
) -> SerializationResult:
|
|
900
905
|
"""Serialize the item's captions."""
|
|
901
906
|
params = self.params.merge_with_patch(patch=kwargs)
|
{docling_core-2.27.0 → docling_core-2.28.1}/docling_core/experimental/serializer/markdown.py
RENAMED
|
@@ -8,7 +8,7 @@ import html
|
|
|
8
8
|
import re
|
|
9
9
|
import textwrap
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Optional, Union
|
|
11
|
+
from typing import Any, Optional, Union
|
|
12
12
|
|
|
13
13
|
from pydantic import AnyUrl, BaseModel, PositiveInt
|
|
14
14
|
from tabulate import tabulate
|
|
@@ -82,7 +82,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
82
82
|
doc_serializer: BaseDocSerializer,
|
|
83
83
|
doc: DoclingDocument,
|
|
84
84
|
is_inline_scope: bool = False,
|
|
85
|
-
**kwargs,
|
|
85
|
+
**kwargs: Any,
|
|
86
86
|
) -> SerializationResult:
|
|
87
87
|
"""Serializes the passed item."""
|
|
88
88
|
params = MarkdownParams(**kwargs)
|
|
@@ -143,7 +143,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
143
143
|
item: TableItem,
|
|
144
144
|
doc_serializer: BaseDocSerializer,
|
|
145
145
|
doc: DoclingDocument,
|
|
146
|
-
**kwargs,
|
|
146
|
+
**kwargs: Any,
|
|
147
147
|
) -> SerializationResult:
|
|
148
148
|
"""Serializes the passed item."""
|
|
149
149
|
res_parts: list[SerializationResult] = []
|
|
@@ -195,7 +195,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
195
195
|
item: PictureItem,
|
|
196
196
|
doc_serializer: BaseDocSerializer,
|
|
197
197
|
doc: DoclingDocument,
|
|
198
|
-
**kwargs,
|
|
198
|
+
**kwargs: Any,
|
|
199
199
|
) -> SerializationResult:
|
|
200
200
|
"""Serializes the passed item."""
|
|
201
201
|
params = MarkdownParams(**kwargs)
|
|
@@ -246,7 +246,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
246
246
|
doc: DoclingDocument,
|
|
247
247
|
image_mode: ImageRefMode,
|
|
248
248
|
image_placeholder: str,
|
|
249
|
-
**kwargs,
|
|
249
|
+
**kwargs: Any,
|
|
250
250
|
) -> SerializationResult:
|
|
251
251
|
error_response = (
|
|
252
252
|
"<!-- 🖼️❌ Image not available. "
|
|
@@ -298,7 +298,7 @@ class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
298
298
|
item: KeyValueItem,
|
|
299
299
|
doc_serializer: "BaseDocSerializer",
|
|
300
300
|
doc: DoclingDocument,
|
|
301
|
-
**kwargs,
|
|
301
|
+
**kwargs: Any,
|
|
302
302
|
) -> SerializationResult:
|
|
303
303
|
"""Serializes the passed item."""
|
|
304
304
|
# TODO add actual implementation
|
|
@@ -321,7 +321,7 @@ class MarkdownFormSerializer(BaseFormSerializer):
|
|
|
321
321
|
item: FormItem,
|
|
322
322
|
doc_serializer: "BaseDocSerializer",
|
|
323
323
|
doc: DoclingDocument,
|
|
324
|
-
**kwargs,
|
|
324
|
+
**kwargs: Any,
|
|
325
325
|
) -> SerializationResult:
|
|
326
326
|
"""Serializes the passed item."""
|
|
327
327
|
# TODO add actual implementation
|
|
@@ -347,7 +347,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
347
347
|
list_level: int = 0,
|
|
348
348
|
is_inline_scope: bool = False,
|
|
349
349
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
350
|
-
**kwargs,
|
|
350
|
+
**kwargs: Any,
|
|
351
351
|
) -> SerializationResult:
|
|
352
352
|
"""Serializes the passed item."""
|
|
353
353
|
params = MarkdownParams(**kwargs)
|
|
@@ -400,7 +400,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
|
|
|
400
400
|
doc: DoclingDocument,
|
|
401
401
|
list_level: int = 0,
|
|
402
402
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
403
|
-
**kwargs,
|
|
403
|
+
**kwargs: Any,
|
|
404
404
|
) -> SerializationResult:
|
|
405
405
|
"""Serializes the passed item."""
|
|
406
406
|
my_visited = visited if visited is not None else set()
|
|
@@ -425,7 +425,7 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
|
|
|
425
425
|
item: NodeItem,
|
|
426
426
|
doc_serializer: "BaseDocSerializer",
|
|
427
427
|
doc: DoclingDocument,
|
|
428
|
-
**kwargs,
|
|
428
|
+
**kwargs: Any,
|
|
429
429
|
) -> SerializationResult:
|
|
430
430
|
"""Serializes the passed item."""
|
|
431
431
|
if isinstance(item, DocItem):
|
|
@@ -453,22 +453,27 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
453
453
|
params: MarkdownParams = MarkdownParams()
|
|
454
454
|
|
|
455
455
|
@override
|
|
456
|
-
def serialize_bold(self, text: str, **kwargs):
|
|
456
|
+
def serialize_bold(self, text: str, **kwargs: Any):
|
|
457
457
|
"""Apply Markdown-specific bold serialization."""
|
|
458
458
|
return f"**{text}**"
|
|
459
459
|
|
|
460
460
|
@override
|
|
461
|
-
def serialize_italic(self, text: str, **kwargs):
|
|
461
|
+
def serialize_italic(self, text: str, **kwargs: Any):
|
|
462
462
|
"""Apply Markdown-specific italic serialization."""
|
|
463
463
|
return f"*{text}*"
|
|
464
464
|
|
|
465
465
|
@override
|
|
466
|
-
def serialize_strikethrough(self, text: str, **kwargs):
|
|
466
|
+
def serialize_strikethrough(self, text: str, **kwargs: Any):
|
|
467
467
|
"""Apply Markdown-specific strikethrough serialization."""
|
|
468
468
|
return f"~~{text}~~"
|
|
469
469
|
|
|
470
470
|
@override
|
|
471
|
-
def serialize_hyperlink(
|
|
471
|
+
def serialize_hyperlink(
|
|
472
|
+
self,
|
|
473
|
+
text: str,
|
|
474
|
+
hyperlink: Union[AnyUrl, Path],
|
|
475
|
+
**kwargs: Any,
|
|
476
|
+
):
|
|
472
477
|
"""Apply Markdown-specific hyperlink serialization."""
|
|
473
478
|
return f"[{text}]({str(hyperlink)})"
|
|
474
479
|
|
|
@@ -505,7 +510,7 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
505
510
|
escape_underscores: bool = True,
|
|
506
511
|
formatting: Optional[Formatting] = None,
|
|
507
512
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
508
|
-
**kwargs,
|
|
513
|
+
**kwargs: Any,
|
|
509
514
|
) -> str:
|
|
510
515
|
"""Apply some text post-processing steps."""
|
|
511
516
|
res = text
|
|
@@ -523,7 +528,10 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
523
528
|
|
|
524
529
|
@override
|
|
525
530
|
def serialize_doc(
|
|
526
|
-
self,
|
|
531
|
+
self,
|
|
532
|
+
*,
|
|
533
|
+
parts: list[SerializationResult],
|
|
534
|
+
**kwargs: Any,
|
|
527
535
|
) -> SerializationResult:
|
|
528
536
|
"""Serialize a document out of its parts."""
|
|
529
537
|
text_res = "\n\n".join([p.text for p in parts if p.text])
|