docling-core 2.26.4__tar.gz → 2.28.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.26.4 → docling_core-2.28.0}/PKG-INFO +4 -2
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/base.py +25 -19
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/common.py +17 -11
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/doctags.py +14 -11
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/html.py +48 -17
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/markdown.py +24 -16
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/chunker/hybrid_chunker.py +49 -31
- docling_core-2.28.0/docling_core/transforms/chunker/tokenizer/__init__.py +1 -0
- docling_core-2.28.0/docling_core/transforms/chunker/tokenizer/base.py +25 -0
- docling_core-2.28.0/docling_core/transforms/chunker/tokenizer/huggingface.py +70 -0
- docling_core-2.28.0/docling_core/transforms/chunker/tokenizer/openai.py +34 -0
- docling_core-2.28.0/docling_core/transforms/visualizer/__init__.py +1 -0
- docling_core-2.28.0/docling_core/transforms/visualizer/base.py +23 -0
- docling_core-2.28.0/docling_core/transforms/visualizer/layout_visualizer.py +201 -0
- docling_core-2.28.0/docling_core/transforms/visualizer/reading_order_visualizer.py +149 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/document.py +26 -2
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/labels.py +2 -1
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/page.py +4 -3
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/document.py +2 -2
- {docling_core-2.26.4 → docling_core-2.28.0}/pyproject.toml +11 -2
- {docling_core-2.26.4 → docling_core-2.28.0}/LICENSE +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/README.md +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/experimental/serializer/html_styles.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/py.typed +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/search/package.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.26.4 → docling_core-2.28.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.26.4
|
|
3
|
+
Version: 2.28.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://github.com/docling-project
|
|
6
6
|
License: MIT
|
|
@@ -26,6 +26,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
27
|
Classifier: Typing :: Typed
|
|
28
28
|
Provides-Extra: chunking
|
|
29
|
+
Provides-Extra: chunking-openai
|
|
29
30
|
Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
30
31
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
32
|
Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
|
|
@@ -33,8 +34,9 @@ Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
|
33
34
|
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
|
34
35
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
35
36
|
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
36
|
-
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
|
|
37
|
+
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking" or extra == "chunking-openai"
|
|
37
38
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
39
|
+
Requires-Dist: tiktoken (>=0.9.0,<0.10.0) ; extra == "chunking-openai"
|
|
38
40
|
Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
|
|
39
41
|
Requires-Dist: typer (>=0.12.5,<0.16.0)
|
|
40
42
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"""Define base classes for serialization."""
|
|
7
7
|
from abc import ABC, abstractmethod
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Optional, Union
|
|
9
|
+
from typing import Any, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import AnyUrl, BaseModel
|
|
12
12
|
|
|
@@ -51,7 +51,7 @@ class BaseTextSerializer(ABC):
|
|
|
51
51
|
item: TextItem,
|
|
52
52
|
doc_serializer: "BaseDocSerializer",
|
|
53
53
|
doc: DoclingDocument,
|
|
54
|
-
**kwargs,
|
|
54
|
+
**kwargs: Any,
|
|
55
55
|
) -> SerializationResult:
|
|
56
56
|
"""Serializes the passed item."""
|
|
57
57
|
...
|
|
@@ -67,7 +67,7 @@ class BaseTableSerializer(ABC):
|
|
|
67
67
|
item: TableItem,
|
|
68
68
|
doc_serializer: "BaseDocSerializer",
|
|
69
69
|
doc: DoclingDocument,
|
|
70
|
-
**kwargs,
|
|
70
|
+
**kwargs: Any,
|
|
71
71
|
) -> SerializationResult:
|
|
72
72
|
"""Serializes the passed item."""
|
|
73
73
|
...
|
|
@@ -83,7 +83,7 @@ class BasePictureSerializer(ABC):
|
|
|
83
83
|
item: PictureItem,
|
|
84
84
|
doc_serializer: "BaseDocSerializer",
|
|
85
85
|
doc: DoclingDocument,
|
|
86
|
-
**kwargs,
|
|
86
|
+
**kwargs: Any,
|
|
87
87
|
) -> SerializationResult:
|
|
88
88
|
"""Serializes the passed item."""
|
|
89
89
|
...
|
|
@@ -99,7 +99,7 @@ class BaseKeyValueSerializer(ABC):
|
|
|
99
99
|
item: KeyValueItem,
|
|
100
100
|
doc_serializer: "BaseDocSerializer",
|
|
101
101
|
doc: DoclingDocument,
|
|
102
|
-
**kwargs,
|
|
102
|
+
**kwargs: Any,
|
|
103
103
|
) -> SerializationResult:
|
|
104
104
|
"""Serializes the passed item."""
|
|
105
105
|
...
|
|
@@ -115,7 +115,7 @@ class BaseFormSerializer(ABC):
|
|
|
115
115
|
item: FormItem,
|
|
116
116
|
doc_serializer: "BaseDocSerializer",
|
|
117
117
|
doc: DoclingDocument,
|
|
118
|
-
**kwargs,
|
|
118
|
+
**kwargs: Any,
|
|
119
119
|
) -> SerializationResult:
|
|
120
120
|
"""Serializes the passed item."""
|
|
121
121
|
...
|
|
@@ -131,7 +131,7 @@ class BaseListSerializer(ABC):
|
|
|
131
131
|
item: Union[UnorderedList, OrderedList],
|
|
132
132
|
doc_serializer: "BaseDocSerializer",
|
|
133
133
|
doc: DoclingDocument,
|
|
134
|
-
**kwargs,
|
|
134
|
+
**kwargs: Any,
|
|
135
135
|
) -> SerializationResult:
|
|
136
136
|
"""Serializes the passed item."""
|
|
137
137
|
...
|
|
@@ -147,7 +147,7 @@ class BaseInlineSerializer(ABC):
|
|
|
147
147
|
item: InlineGroup,
|
|
148
148
|
doc_serializer: "BaseDocSerializer",
|
|
149
149
|
doc: DoclingDocument,
|
|
150
|
-
**kwargs,
|
|
150
|
+
**kwargs: Any,
|
|
151
151
|
) -> SerializationResult:
|
|
152
152
|
"""Serializes the passed item."""
|
|
153
153
|
...
|
|
@@ -163,7 +163,7 @@ class BaseFallbackSerializer(ABC):
|
|
|
163
163
|
item: NodeItem,
|
|
164
164
|
doc_serializer: "BaseDocSerializer",
|
|
165
165
|
doc: DoclingDocument,
|
|
166
|
-
**kwargs,
|
|
166
|
+
**kwargs: Any,
|
|
167
167
|
) -> SerializationResult:
|
|
168
168
|
"""Serializes the passed item."""
|
|
169
169
|
...
|
|
@@ -174,34 +174,40 @@ class BaseDocSerializer(ABC):
|
|
|
174
174
|
|
|
175
175
|
@abstractmethod
|
|
176
176
|
def serialize(
|
|
177
|
-
self,
|
|
177
|
+
self,
|
|
178
|
+
*,
|
|
179
|
+
item: Optional[NodeItem] = None,
|
|
180
|
+
**kwargs: Any,
|
|
178
181
|
) -> SerializationResult:
|
|
179
182
|
"""Run the serialization."""
|
|
180
183
|
...
|
|
181
184
|
|
|
182
185
|
@abstractmethod
|
|
183
|
-
def serialize_bold(self, text: str, **kwargs) -> str:
|
|
186
|
+
def serialize_bold(self, text: str, **kwargs: Any) -> str:
|
|
184
187
|
"""Hook for bold formatting serialization."""
|
|
185
188
|
...
|
|
186
189
|
|
|
187
190
|
@abstractmethod
|
|
188
|
-
def serialize_italic(self, text: str, **kwargs) -> str:
|
|
191
|
+
def serialize_italic(self, text: str, **kwargs: Any) -> str:
|
|
189
192
|
"""Hook for italic formatting serialization."""
|
|
190
193
|
...
|
|
191
194
|
|
|
192
195
|
@abstractmethod
|
|
193
|
-
def serialize_underline(self, text: str, **kwargs) -> str:
|
|
196
|
+
def serialize_underline(self, text: str, **kwargs: Any) -> str:
|
|
194
197
|
"""Hook for underline formatting serialization."""
|
|
195
198
|
...
|
|
196
199
|
|
|
197
200
|
@abstractmethod
|
|
198
|
-
def serialize_strikethrough(self, text: str, **kwargs) -> str:
|
|
201
|
+
def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
|
|
199
202
|
"""Hook for strikethrough formatting serialization."""
|
|
200
203
|
...
|
|
201
204
|
|
|
202
205
|
@abstractmethod
|
|
203
206
|
def serialize_hyperlink(
|
|
204
|
-
self,
|
|
207
|
+
self,
|
|
208
|
+
text: str,
|
|
209
|
+
hyperlink: Union[AnyUrl, Path],
|
|
210
|
+
**kwargs: Any,
|
|
205
211
|
) -> str:
|
|
206
212
|
"""Hook for hyperlink serialization."""
|
|
207
213
|
...
|
|
@@ -210,7 +216,7 @@ class BaseDocSerializer(ABC):
|
|
|
210
216
|
def get_parts(
|
|
211
217
|
self,
|
|
212
218
|
item: Optional[NodeItem] = None,
|
|
213
|
-
**kwargs,
|
|
219
|
+
**kwargs: Any,
|
|
214
220
|
) -> list[SerializationResult]:
|
|
215
221
|
"""Get the components to be combined for serializing this node."""
|
|
216
222
|
...
|
|
@@ -219,7 +225,7 @@ class BaseDocSerializer(ABC):
|
|
|
219
225
|
def post_process(
|
|
220
226
|
self,
|
|
221
227
|
text: str,
|
|
222
|
-
**kwargs,
|
|
228
|
+
**kwargs: Any,
|
|
223
229
|
) -> str:
|
|
224
230
|
"""Apply some text post-processing steps."""
|
|
225
231
|
...
|
|
@@ -228,13 +234,13 @@ class BaseDocSerializer(ABC):
|
|
|
228
234
|
def serialize_captions(
|
|
229
235
|
self,
|
|
230
236
|
item: FloatingItem,
|
|
231
|
-
**kwargs,
|
|
237
|
+
**kwargs: Any,
|
|
232
238
|
) -> SerializationResult:
|
|
233
239
|
"""Serialize the item's captions."""
|
|
234
240
|
...
|
|
235
241
|
|
|
236
242
|
@abstractmethod
|
|
237
|
-
def get_excluded_refs(self, **kwargs) -> set[str]:
|
|
243
|
+
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
|
|
238
244
|
"""Get references to excluded items."""
|
|
239
245
|
...
|
|
240
246
|
|
|
@@ -214,7 +214,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
214
214
|
return refs
|
|
215
215
|
|
|
216
216
|
@override
|
|
217
|
-
def get_excluded_refs(self, **kwargs) -> set[str]:
|
|
217
|
+
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
|
|
218
218
|
"""References to excluded items."""
|
|
219
219
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
220
220
|
params_json = params.model_dump_json()
|
|
@@ -252,7 +252,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
252
252
|
|
|
253
253
|
@abstractmethod
|
|
254
254
|
def serialize_doc(
|
|
255
|
-
self,
|
|
255
|
+
self,
|
|
256
|
+
*,
|
|
257
|
+
parts: list[SerializationResult],
|
|
258
|
+
**kwargs: Any,
|
|
256
259
|
) -> SerializationResult:
|
|
257
260
|
"""Serialize a document out of its pages."""
|
|
258
261
|
...
|
|
@@ -271,7 +274,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
271
274
|
list_level: int = 0,
|
|
272
275
|
is_inline_scope: bool = False,
|
|
273
276
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
274
|
-
**kwargs,
|
|
277
|
+
**kwargs: Any,
|
|
275
278
|
) -> SerializationResult:
|
|
276
279
|
"""Serialize a given node."""
|
|
277
280
|
my_visited: set[str] = visited if visited is not None else set()
|
|
@@ -380,7 +383,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
380
383
|
list_level: int = 0,
|
|
381
384
|
is_inline_scope: bool = False,
|
|
382
385
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
383
|
-
**kwargs,
|
|
386
|
+
**kwargs: Any,
|
|
384
387
|
) -> list[SerializationResult]:
|
|
385
388
|
"""Get the components to be combined for serializing this node."""
|
|
386
389
|
parts: list[SerializationResult] = []
|
|
@@ -415,7 +418,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
415
418
|
*,
|
|
416
419
|
formatting: Optional[Formatting] = None,
|
|
417
420
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
418
|
-
**kwargs,
|
|
421
|
+
**kwargs: Any,
|
|
419
422
|
) -> str:
|
|
420
423
|
"""Apply some text post-processing steps."""
|
|
421
424
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
@@ -434,28 +437,31 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
434
437
|
return res
|
|
435
438
|
|
|
436
439
|
@override
|
|
437
|
-
def serialize_bold(self, text: str, **kwargs) -> str:
|
|
440
|
+
def serialize_bold(self, text: str, **kwargs: Any) -> str:
|
|
438
441
|
"""Hook for bold formatting serialization."""
|
|
439
442
|
return text
|
|
440
443
|
|
|
441
444
|
@override
|
|
442
|
-
def serialize_italic(self, text: str, **kwargs) -> str:
|
|
445
|
+
def serialize_italic(self, text: str, **kwargs: Any) -> str:
|
|
443
446
|
"""Hook for italic formatting serialization."""
|
|
444
447
|
return text
|
|
445
448
|
|
|
446
449
|
@override
|
|
447
|
-
def serialize_underline(self, text: str, **kwargs) -> str:
|
|
450
|
+
def serialize_underline(self, text: str, **kwargs: Any) -> str:
|
|
448
451
|
"""Hook for underline formatting serialization."""
|
|
449
452
|
return text
|
|
450
453
|
|
|
451
454
|
@override
|
|
452
|
-
def serialize_strikethrough(self, text: str, **kwargs) -> str:
|
|
455
|
+
def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
|
|
453
456
|
"""Hook for strikethrough formatting serialization."""
|
|
454
457
|
return text
|
|
455
458
|
|
|
456
459
|
@override
|
|
457
460
|
def serialize_hyperlink(
|
|
458
|
-
self,
|
|
461
|
+
self,
|
|
462
|
+
text: str,
|
|
463
|
+
hyperlink: Union[AnyUrl, Path],
|
|
464
|
+
**kwargs: Any,
|
|
459
465
|
) -> str:
|
|
460
466
|
"""Hook for hyperlink serialization."""
|
|
461
467
|
return text
|
|
@@ -464,7 +470,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
464
470
|
def serialize_captions(
|
|
465
471
|
self,
|
|
466
472
|
item: FloatingItem,
|
|
467
|
-
**kwargs,
|
|
473
|
+
**kwargs: Any,
|
|
468
474
|
) -> SerializationResult:
|
|
469
475
|
"""Serialize the item's captions."""
|
|
470
476
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Define classes for Doctags serialization."""
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Dict, List, Optional, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
from typing_extensions import override
|
|
@@ -91,7 +91,7 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
91
91
|
item: TextItem,
|
|
92
92
|
doc_serializer: BaseDocSerializer,
|
|
93
93
|
doc: DoclingDocument,
|
|
94
|
-
**kwargs,
|
|
94
|
+
**kwargs: Any,
|
|
95
95
|
) -> SerializationResult:
|
|
96
96
|
"""Serializes the passed item."""
|
|
97
97
|
from docling_core.types.doc.document import SectionHeaderItem
|
|
@@ -154,7 +154,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
154
154
|
item: TableItem,
|
|
155
155
|
doc_serializer: BaseDocSerializer,
|
|
156
156
|
doc: DoclingDocument,
|
|
157
|
-
**kwargs,
|
|
157
|
+
**kwargs: Any,
|
|
158
158
|
) -> SerializationResult:
|
|
159
159
|
"""Serializes the passed item."""
|
|
160
160
|
params = DocTagsParams(**kwargs)
|
|
@@ -201,7 +201,7 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
201
201
|
item: PictureItem,
|
|
202
202
|
doc_serializer: BaseDocSerializer,
|
|
203
203
|
doc: DoclingDocument,
|
|
204
|
-
**kwargs,
|
|
204
|
+
**kwargs: Any,
|
|
205
205
|
) -> SerializationResult:
|
|
206
206
|
"""Serializes the passed item."""
|
|
207
207
|
params = DocTagsParams(**kwargs)
|
|
@@ -284,7 +284,7 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
284
284
|
item: KeyValueItem,
|
|
285
285
|
doc_serializer: "BaseDocSerializer",
|
|
286
286
|
doc: DoclingDocument,
|
|
287
|
-
**kwargs,
|
|
287
|
+
**kwargs: Any,
|
|
288
288
|
) -> SerializationResult:
|
|
289
289
|
"""Serializes the passed item."""
|
|
290
290
|
params = DocTagsParams(**kwargs)
|
|
@@ -356,7 +356,7 @@ class DocTagsFormSerializer(BaseFormSerializer):
|
|
|
356
356
|
item: FormItem,
|
|
357
357
|
doc_serializer: "BaseDocSerializer",
|
|
358
358
|
doc: DoclingDocument,
|
|
359
|
-
**kwargs,
|
|
359
|
+
**kwargs: Any,
|
|
360
360
|
) -> SerializationResult:
|
|
361
361
|
"""Serializes the passed item."""
|
|
362
362
|
# TODO add actual implementation
|
|
@@ -378,7 +378,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
378
378
|
list_level: int = 0,
|
|
379
379
|
is_inline_scope: bool = False,
|
|
380
380
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
381
|
-
**kwargs,
|
|
381
|
+
**kwargs: Any,
|
|
382
382
|
) -> SerializationResult:
|
|
383
383
|
"""Serializes the passed item."""
|
|
384
384
|
my_visited = visited if visited is not None else set()
|
|
@@ -423,7 +423,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
423
423
|
doc: DoclingDocument,
|
|
424
424
|
list_level: int = 0,
|
|
425
425
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
426
|
-
**kwargs,
|
|
426
|
+
**kwargs: Any,
|
|
427
427
|
) -> SerializationResult:
|
|
428
428
|
"""Serializes the passed item."""
|
|
429
429
|
my_visited = visited if visited is not None else set()
|
|
@@ -454,7 +454,7 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
454
454
|
item: NodeItem,
|
|
455
455
|
doc_serializer: "BaseDocSerializer",
|
|
456
456
|
doc: DoclingDocument,
|
|
457
|
-
**kwargs,
|
|
457
|
+
**kwargs: Any,
|
|
458
458
|
) -> SerializationResult:
|
|
459
459
|
"""Serializes the passed item."""
|
|
460
460
|
return create_ser_result()
|
|
@@ -477,7 +477,10 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
477
477
|
|
|
478
478
|
@override
|
|
479
479
|
def serialize_doc(
|
|
480
|
-
self,
|
|
480
|
+
self,
|
|
481
|
+
*,
|
|
482
|
+
parts: list[SerializationResult],
|
|
483
|
+
**kwargs: Any,
|
|
481
484
|
) -> SerializationResult:
|
|
482
485
|
"""Serialize a document out of its pages."""
|
|
483
486
|
delim = _get_delim(params=self.params)
|
|
@@ -496,7 +499,7 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
496
499
|
def serialize_captions(
|
|
497
500
|
self,
|
|
498
501
|
item: FloatingItem,
|
|
499
|
-
**kwargs,
|
|
502
|
+
**kwargs: Any,
|
|
500
503
|
) -> SerializationResult:
|
|
501
504
|
"""Serialize the item's captions."""
|
|
502
505
|
params = DocTagsParams(**kwargs)
|
|
@@ -10,7 +10,7 @@ import logging
|
|
|
10
10
|
from enum import Enum
|
|
11
11
|
from io import BytesIO
|
|
12
12
|
from pathlib import Path
|
|
13
|
-
from typing import Optional, Union
|
|
13
|
+
from typing import Any, Optional, Union
|
|
14
14
|
from urllib.parse import quote
|
|
15
15
|
from xml.etree.cElementTree import SubElement, tostring
|
|
16
16
|
from xml.sax.saxutils import unescape
|
|
@@ -57,6 +57,7 @@ from docling_core.types.doc.document import (
|
|
|
57
57
|
NodeItem,
|
|
58
58
|
OrderedList,
|
|
59
59
|
PictureItem,
|
|
60
|
+
PictureTabularChartData,
|
|
60
61
|
SectionHeaderItem,
|
|
61
62
|
TableCell,
|
|
62
63
|
TableItem,
|
|
@@ -104,6 +105,9 @@ class HTMLParams(CommonParams):
|
|
|
104
105
|
# Allow for different output styles
|
|
105
106
|
output_style: HTMLOutputStyle = HTMLOutputStyle.SINGLE_COLUMN
|
|
106
107
|
|
|
108
|
+
# Enable charts to be printed into HTML as tables
|
|
109
|
+
enable_chart_tables: bool = True
|
|
110
|
+
|
|
107
111
|
|
|
108
112
|
class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
109
113
|
"""HTML-specific text item serializer."""
|
|
@@ -116,7 +120,7 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
116
120
|
doc_serializer: BaseDocSerializer,
|
|
117
121
|
doc: DoclingDocument,
|
|
118
122
|
is_inline_scope: bool = False,
|
|
119
|
-
**kwargs,
|
|
123
|
+
**kwargs: Any,
|
|
120
124
|
) -> SerializationResult:
|
|
121
125
|
"""Serializes the passed text item to HTML."""
|
|
122
126
|
params = HTMLParams(**kwargs)
|
|
@@ -292,7 +296,7 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
292
296
|
item: TableItem,
|
|
293
297
|
doc_serializer: BaseDocSerializer,
|
|
294
298
|
doc: DoclingDocument,
|
|
295
|
-
**kwargs,
|
|
299
|
+
**kwargs: Any,
|
|
296
300
|
) -> SerializationResult:
|
|
297
301
|
"""Serializes the passed table item to HTML."""
|
|
298
302
|
nrows = item.data.num_rows
|
|
@@ -363,7 +367,7 @@ class HTMLPictureSerializer(BasePictureSerializer):
|
|
|
363
367
|
item: PictureItem,
|
|
364
368
|
doc_serializer: BaseDocSerializer,
|
|
365
369
|
doc: DoclingDocument,
|
|
366
|
-
**kwargs,
|
|
370
|
+
**kwargs: Any,
|
|
367
371
|
) -> SerializationResult:
|
|
368
372
|
"""Export picture to HTML format."""
|
|
369
373
|
params = HTMLParams(**kwargs)
|
|
@@ -402,9 +406,28 @@ class HTMLPictureSerializer(BasePictureSerializer):
|
|
|
402
406
|
and item.image.uri.scheme == "data"
|
|
403
407
|
):
|
|
404
408
|
img_text = f'<img src="{quote(str(item.image.uri))}">'
|
|
409
|
+
|
|
405
410
|
if img_text:
|
|
406
411
|
res_parts.append(create_ser_result(text=img_text, span_source=item))
|
|
407
412
|
|
|
413
|
+
if params.enable_chart_tables:
|
|
414
|
+
# Check if picture has attached PictureTabularChartData
|
|
415
|
+
tabular_chart_annotations = [
|
|
416
|
+
ann
|
|
417
|
+
for ann in item.annotations
|
|
418
|
+
if isinstance(ann, PictureTabularChartData)
|
|
419
|
+
]
|
|
420
|
+
if len(tabular_chart_annotations) > 0:
|
|
421
|
+
temp_doc = DoclingDocument(name="temp")
|
|
422
|
+
temp_table = temp_doc.add_table(
|
|
423
|
+
data=tabular_chart_annotations[0].chart_data
|
|
424
|
+
)
|
|
425
|
+
html_table_content = temp_table.export_to_html(temp_doc)
|
|
426
|
+
if len(html_table_content) > 0:
|
|
427
|
+
res_parts.append(
|
|
428
|
+
create_ser_result(text=html_table_content, span_source=item)
|
|
429
|
+
)
|
|
430
|
+
|
|
408
431
|
text_res = "".join([r.text for r in res_parts])
|
|
409
432
|
if text_res:
|
|
410
433
|
text_res = f"<figure>{text_res}</figure>"
|
|
@@ -551,7 +574,7 @@ class HTMLKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
551
574
|
item: KeyValueItem,
|
|
552
575
|
doc_serializer: "BaseDocSerializer",
|
|
553
576
|
doc: DoclingDocument,
|
|
554
|
-
**kwargs,
|
|
577
|
+
**kwargs: Any,
|
|
555
578
|
) -> SerializationResult:
|
|
556
579
|
"""Serializes the passed key-value item to HTML."""
|
|
557
580
|
res_parts: list[SerializationResult] = []
|
|
@@ -588,7 +611,7 @@ class HTMLFormSerializer(BaseFormSerializer):
|
|
|
588
611
|
item: FormItem,
|
|
589
612
|
doc_serializer: "BaseDocSerializer",
|
|
590
613
|
doc: DoclingDocument,
|
|
591
|
-
**kwargs,
|
|
614
|
+
**kwargs: Any,
|
|
592
615
|
) -> SerializationResult:
|
|
593
616
|
"""Serializes the passed form item to HTML."""
|
|
594
617
|
res_parts: list[SerializationResult] = []
|
|
@@ -628,7 +651,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
628
651
|
list_level: int = 0,
|
|
629
652
|
is_inline_scope: bool = False,
|
|
630
653
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
631
|
-
**kwargs,
|
|
654
|
+
**kwargs: Any,
|
|
632
655
|
) -> SerializationResult:
|
|
633
656
|
"""Serializes a list to HTML."""
|
|
634
657
|
my_visited: set[str] = visited if visited is not None else set()
|
|
@@ -676,7 +699,7 @@ class HTMLInlineSerializer(BaseInlineSerializer):
|
|
|
676
699
|
doc: DoclingDocument,
|
|
677
700
|
list_level: int = 0,
|
|
678
701
|
visited: Optional[set[str]] = None, # refs of visited items
|
|
679
|
-
**kwargs,
|
|
702
|
+
**kwargs: Any,
|
|
680
703
|
) -> SerializationResult:
|
|
681
704
|
"""Serializes an inline group to HTML."""
|
|
682
705
|
my_visited: set[str] = visited if visited is not None else set()
|
|
@@ -710,7 +733,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
710
733
|
item: NodeItem,
|
|
711
734
|
doc_serializer: "BaseDocSerializer",
|
|
712
735
|
doc: DoclingDocument,
|
|
713
|
-
**kwargs,
|
|
736
|
+
**kwargs: Any,
|
|
714
737
|
) -> SerializationResult:
|
|
715
738
|
"""Fallback serializer for items not handled by other serializers."""
|
|
716
739
|
if isinstance(item, DocItem):
|
|
@@ -739,35 +762,40 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
739
762
|
params: HTMLParams = HTMLParams()
|
|
740
763
|
|
|
741
764
|
@override
|
|
742
|
-
def serialize_bold(self, text: str, **kwargs) -> str:
|
|
765
|
+
def serialize_bold(self, text: str, **kwargs: Any) -> str:
|
|
743
766
|
"""Apply HTML-specific bold serialization."""
|
|
744
767
|
return f"<strong>{text}</strong>"
|
|
745
768
|
|
|
746
769
|
@override
|
|
747
|
-
def serialize_italic(self, text: str, **kwargs) -> str:
|
|
770
|
+
def serialize_italic(self, text: str, **kwargs: Any) -> str:
|
|
748
771
|
"""Apply HTML-specific italic serialization."""
|
|
749
772
|
return f"<em>{text}</em>"
|
|
750
773
|
|
|
751
774
|
@override
|
|
752
|
-
def serialize_underline(self, text: str, **kwargs) -> str:
|
|
775
|
+
def serialize_underline(self, text: str, **kwargs: Any) -> str:
|
|
753
776
|
"""Apply HTML-specific underline serialization."""
|
|
754
777
|
return f"<u>{text}</u>"
|
|
755
778
|
|
|
756
779
|
@override
|
|
757
|
-
def serialize_strikethrough(self, text: str, **kwargs) -> str:
|
|
780
|
+
def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
|
|
758
781
|
"""Apply HTML-specific strikethrough serialization."""
|
|
759
782
|
return f"<del>{text}</del>"
|
|
760
783
|
|
|
761
784
|
@override
|
|
762
785
|
def serialize_hyperlink(
|
|
763
|
-
self,
|
|
786
|
+
self,
|
|
787
|
+
text: str,
|
|
788
|
+
hyperlink: Union[AnyUrl, Path],
|
|
789
|
+
**kwargs: Any,
|
|
764
790
|
) -> str:
|
|
765
791
|
"""Apply HTML-specific hyperlink serialization."""
|
|
766
792
|
return f'<a href="{str(hyperlink)}">{text}</a>'
|
|
767
793
|
|
|
768
794
|
@override
|
|
769
795
|
def serialize_doc(
|
|
770
|
-
self,
|
|
796
|
+
self,
|
|
797
|
+
parts: list[SerializationResult],
|
|
798
|
+
**kwargs: Any,
|
|
771
799
|
) -> SerializationResult:
|
|
772
800
|
"""Serialize a document out of its pages."""
|
|
773
801
|
# Create HTML structure
|
|
@@ -779,6 +807,8 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
779
807
|
]
|
|
780
808
|
|
|
781
809
|
if self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
|
|
810
|
+
applicable_pages = self._get_applicable_pages()
|
|
811
|
+
|
|
782
812
|
html_content = "\n".join([p.text for p in parts if p.text])
|
|
783
813
|
next_page: Optional[int] = None
|
|
784
814
|
prev_full_match_end = 0
|
|
@@ -791,11 +821,12 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
791
821
|
# capture last page
|
|
792
822
|
if next_page is not None:
|
|
793
823
|
pages[next_page] = html_content[prev_full_match_end:]
|
|
824
|
+
elif applicable_pages is not None and len(applicable_pages) == 1:
|
|
825
|
+
pages[applicable_pages[0]] = html_content
|
|
794
826
|
|
|
795
827
|
html_parts.append("<table>")
|
|
796
828
|
html_parts.append("<tbody>")
|
|
797
829
|
|
|
798
|
-
applicable_pages = self._get_applicable_pages()
|
|
799
830
|
for page_no, page in pages.items():
|
|
800
831
|
|
|
801
832
|
if isinstance(page_no, int):
|
|
@@ -869,7 +900,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
869
900
|
self,
|
|
870
901
|
item: FloatingItem,
|
|
871
902
|
tag: str = "figcaption",
|
|
872
|
-
**kwargs,
|
|
903
|
+
**kwargs: Any,
|
|
873
904
|
) -> SerializationResult:
|
|
874
905
|
"""Serialize the item's captions."""
|
|
875
906
|
params = self.params.merge_with_patch(patch=kwargs)
|