docling-core 2.33.1__tar.gz → 2.34.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.33.1 → docling_core-2.34.0}/PKG-INFO +1 -1
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/serializer/base.py +34 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/serializer/common.py +37 -3
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/serializer/doctags.py +65 -6
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/serializer/html.py +61 -23
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/serializer/markdown.py +85 -18
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/doc/document.py +211 -53
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.33.1 → docling_core-2.34.0}/pyproject.toml +1 -1
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_docling_doc.py +34 -1
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_doctags_load.py +23 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_serialization.py +69 -5
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_visualization.py +12 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/LICENSE +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/README.md +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/py.typed +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/search/package.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/setup.cfg +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_collection.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_doc_base.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_page.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_search_meta.py +0 -0
- {docling_core-2.33.1 → docling_core-2.34.0}/test/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.33.1
|
|
3
|
+
Version: 2.34.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -202,6 +202,16 @@ class BaseDocSerializer(ABC):
|
|
|
202
202
|
"""Hook for strikethrough formatting serialization."""
|
|
203
203
|
...
|
|
204
204
|
|
|
205
|
+
@abstractmethod
|
|
206
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
207
|
+
"""Hook for subscript formatting serialization."""
|
|
208
|
+
...
|
|
209
|
+
|
|
210
|
+
@abstractmethod
|
|
211
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
212
|
+
"""Hook for superscript formatting serialization."""
|
|
213
|
+
...
|
|
214
|
+
|
|
205
215
|
@abstractmethod
|
|
206
216
|
def serialize_hyperlink(
|
|
207
217
|
self,
|
|
@@ -239,6 +249,15 @@ class BaseDocSerializer(ABC):
|
|
|
239
249
|
"""Serialize the item's captions."""
|
|
240
250
|
...
|
|
241
251
|
|
|
252
|
+
@abstractmethod
|
|
253
|
+
def serialize_annotations(
|
|
254
|
+
self,
|
|
255
|
+
item: DocItem,
|
|
256
|
+
**kwargs: Any,
|
|
257
|
+
) -> SerializationResult:
|
|
258
|
+
"""Serialize the item's annotations."""
|
|
259
|
+
...
|
|
260
|
+
|
|
242
261
|
@abstractmethod
|
|
243
262
|
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
|
|
244
263
|
"""Get references to excluded items."""
|
|
@@ -257,3 +276,18 @@ class BaseSerializerProvider(ABC):
|
|
|
257
276
|
def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
|
|
258
277
|
"""Get a the associated serializer."""
|
|
259
278
|
...
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class BaseAnnotationSerializer(ABC):
|
|
282
|
+
"""Base class for annotation serializers."""
|
|
283
|
+
|
|
284
|
+
@abstractmethod
|
|
285
|
+
def serialize(
|
|
286
|
+
self,
|
|
287
|
+
*,
|
|
288
|
+
item: DocItem,
|
|
289
|
+
doc: DoclingDocument,
|
|
290
|
+
**kwargs: Any,
|
|
291
|
+
) -> SerializationResult:
|
|
292
|
+
"""Serializes the passed annotation."""
|
|
293
|
+
...
|
|
@@ -15,6 +15,7 @@ from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_fie
|
|
|
15
15
|
from typing_extensions import Self, override
|
|
16
16
|
|
|
17
17
|
from docling_core.transforms.serializer.base import (
|
|
18
|
+
BaseAnnotationSerializer,
|
|
18
19
|
BaseDocSerializer,
|
|
19
20
|
BaseFallbackSerializer,
|
|
20
21
|
BaseFormSerializer,
|
|
@@ -30,6 +31,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
30
31
|
from docling_core.types.doc.document import (
|
|
31
32
|
DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
32
33
|
ContentLayer,
|
|
34
|
+
DescriptionAnnotation,
|
|
33
35
|
DocItem,
|
|
34
36
|
DoclingDocument,
|
|
35
37
|
FloatingItem,
|
|
@@ -41,9 +43,10 @@ from docling_core.types.doc.document import (
|
|
|
41
43
|
OrderedList,
|
|
42
44
|
PictureClassificationData,
|
|
43
45
|
PictureDataType,
|
|
44
|
-
PictureDescriptionData,
|
|
45
46
|
PictureItem,
|
|
46
47
|
PictureMoleculeData,
|
|
48
|
+
Script,
|
|
49
|
+
TableAnnotationType,
|
|
47
50
|
TableItem,
|
|
48
51
|
TextItem,
|
|
49
52
|
UnorderedList,
|
|
@@ -122,7 +125,9 @@ def _iterate_items(
|
|
|
122
125
|
yield item
|
|
123
126
|
|
|
124
127
|
|
|
125
|
-
def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
|
|
128
|
+
def _get_annotation_text(
|
|
129
|
+
annotation: Union[PictureDataType, TableAnnotationType],
|
|
130
|
+
) -> Optional[str]:
|
|
126
131
|
result = None
|
|
127
132
|
if isinstance(annotation, PictureClassificationData):
|
|
128
133
|
predicted_class = (
|
|
@@ -132,7 +137,7 @@ def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
|
|
|
132
137
|
)
|
|
133
138
|
if predicted_class is not None:
|
|
134
139
|
result = predicted_class.replace("_", " ")
|
|
135
|
-
elif isinstance(annotation, PictureDescriptionData):
|
|
140
|
+
elif isinstance(annotation, DescriptionAnnotation):
|
|
136
141
|
result = annotation.text
|
|
137
142
|
elif isinstance(annotation, PictureMoleculeData):
|
|
138
143
|
result = annotation.smi
|
|
@@ -211,6 +216,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
211
216
|
list_serializer: BaseListSerializer
|
|
212
217
|
inline_serializer: BaseInlineSerializer
|
|
213
218
|
|
|
219
|
+
annotation_serializer: BaseAnnotationSerializer
|
|
220
|
+
|
|
214
221
|
params: CommonParams = CommonParams()
|
|
215
222
|
|
|
216
223
|
_excluded_refs_cache: dict[str, set[str]] = {}
|
|
@@ -449,6 +456,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
449
456
|
res = self.serialize_underline(text=res)
|
|
450
457
|
if formatting.strikethrough:
|
|
451
458
|
res = self.serialize_strikethrough(text=res)
|
|
459
|
+
if formatting.script == Script.SUB:
|
|
460
|
+
res = self.serialize_subscript(text=res)
|
|
461
|
+
elif formatting.script == Script.SUPER:
|
|
462
|
+
res = self.serialize_superscript(text=res)
|
|
452
463
|
if params.include_hyperlinks and hyperlink:
|
|
453
464
|
res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
|
|
454
465
|
return res
|
|
@@ -473,6 +484,16 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
473
484
|
"""Hook for strikethrough formatting serialization."""
|
|
474
485
|
return text
|
|
475
486
|
|
|
487
|
+
@override
|
|
488
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
489
|
+
"""Hook for subscript formatting serialization."""
|
|
490
|
+
return text
|
|
491
|
+
|
|
492
|
+
@override
|
|
493
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
494
|
+
"""Hook for superscript formatting serialization."""
|
|
495
|
+
return text
|
|
496
|
+
|
|
476
497
|
@override
|
|
477
498
|
def serialize_hyperlink(
|
|
478
499
|
self,
|
|
@@ -505,6 +526,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
505
526
|
text_res = ""
|
|
506
527
|
return create_ser_result(text=text_res, span_source=results)
|
|
507
528
|
|
|
529
|
+
@override
|
|
530
|
+
def serialize_annotations(
|
|
531
|
+
self,
|
|
532
|
+
item: DocItem,
|
|
533
|
+
**kwargs: Any,
|
|
534
|
+
) -> SerializationResult:
|
|
535
|
+
"""Serialize the item's annotations."""
|
|
536
|
+
return self.annotation_serializer.serialize(
|
|
537
|
+
item=item,
|
|
538
|
+
doc=self.doc,
|
|
539
|
+
**kwargs,
|
|
540
|
+
)
|
|
541
|
+
|
|
508
542
|
def _get_applicable_pages(self) -> Optional[list[int]]:
|
|
509
543
|
pages = {
|
|
510
544
|
item.prov[0].page_no: ...
|
|
@@ -7,6 +7,7 @@ from pydantic import BaseModel
|
|
|
7
7
|
from typing_extensions import override
|
|
8
8
|
|
|
9
9
|
from docling_core.transforms.serializer.base import (
|
|
10
|
+
BaseAnnotationSerializer,
|
|
10
11
|
BaseDocSerializer,
|
|
11
12
|
BaseFallbackSerializer,
|
|
12
13
|
BaseFormSerializer,
|
|
@@ -17,12 +18,14 @@ from docling_core.transforms.serializer.base import (
|
|
|
17
18
|
BaseTableSerializer,
|
|
18
19
|
BaseTextSerializer,
|
|
19
20
|
SerializationResult,
|
|
21
|
+
Span,
|
|
20
22
|
)
|
|
21
23
|
from docling_core.transforms.serializer.common import (
|
|
22
24
|
CommonParams,
|
|
23
25
|
DocSerializer,
|
|
24
26
|
create_ser_result,
|
|
25
27
|
)
|
|
28
|
+
from docling_core.types.doc.base import BoundingBox
|
|
26
29
|
from docling_core.types.doc.document import (
|
|
27
30
|
CodeItem,
|
|
28
31
|
DocItem,
|
|
@@ -38,6 +41,7 @@ from docling_core.types.doc.document import (
|
|
|
38
41
|
PictureItem,
|
|
39
42
|
PictureMoleculeData,
|
|
40
43
|
PictureTabularChartData,
|
|
44
|
+
ProvenanceItem,
|
|
41
45
|
TableItem,
|
|
42
46
|
TextItem,
|
|
43
47
|
UnorderedList,
|
|
@@ -414,6 +418,39 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
414
418
|
class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
415
419
|
"""DocTags-specific inline group serializer."""
|
|
416
420
|
|
|
421
|
+
def _get_inline_location_tags(
|
|
422
|
+
self, doc: DoclingDocument, item: InlineGroup, params: DocTagsParams
|
|
423
|
+
) -> SerializationResult:
|
|
424
|
+
|
|
425
|
+
prov: Optional[ProvenanceItem] = None
|
|
426
|
+
boxes: list[BoundingBox] = []
|
|
427
|
+
doc_items: list[DocItem] = []
|
|
428
|
+
for it, _ in doc.iterate_items(root=item):
|
|
429
|
+
if isinstance(it, DocItem):
|
|
430
|
+
for prov in it.prov:
|
|
431
|
+
boxes.append(prov.bbox)
|
|
432
|
+
doc_items.append(it)
|
|
433
|
+
if prov is None:
|
|
434
|
+
return create_ser_result()
|
|
435
|
+
|
|
436
|
+
bbox = BoundingBox.enclosing_bbox(boxes=boxes)
|
|
437
|
+
|
|
438
|
+
# using last seen prov as reference for page dims
|
|
439
|
+
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
|
|
440
|
+
|
|
441
|
+
loc_str = DocumentToken.get_location(
|
|
442
|
+
bbox=bbox.to_top_left_origin(page_h).as_tuple(),
|
|
443
|
+
page_w=page_w,
|
|
444
|
+
page_h=page_h,
|
|
445
|
+
xsize=params.xsize,
|
|
446
|
+
ysize=params.ysize,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
return SerializationResult(
|
|
450
|
+
text=loc_str,
|
|
451
|
+
spans=[Span(item=it) for it in doc_items],
|
|
452
|
+
)
|
|
453
|
+
|
|
417
454
|
@override
|
|
418
455
|
def serialize(
|
|
419
456
|
self,
|
|
@@ -428,12 +465,23 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
428
465
|
"""Serializes the passed item."""
|
|
429
466
|
my_visited = visited if visited is not None else set()
|
|
430
467
|
params = DocTagsParams(**kwargs)
|
|
431
|
-
parts =
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
468
|
+
parts: List[SerializationResult] = []
|
|
469
|
+
if params.add_location:
|
|
470
|
+
inline_loc_tags_ser_res = self._get_inline_location_tags(
|
|
471
|
+
doc=doc,
|
|
472
|
+
item=item,
|
|
473
|
+
params=params,
|
|
474
|
+
)
|
|
475
|
+
parts.append(inline_loc_tags_ser_res)
|
|
476
|
+
params.add_location = False # suppress children location serialization
|
|
477
|
+
parts.extend(
|
|
478
|
+
doc_serializer.get_parts(
|
|
479
|
+
item=item,
|
|
480
|
+
list_level=list_level,
|
|
481
|
+
is_inline_scope=True,
|
|
482
|
+
visited=my_visited,
|
|
483
|
+
**{**kwargs, **params.model_dump()},
|
|
484
|
+
)
|
|
437
485
|
)
|
|
438
486
|
wrap_tag = DocumentToken.INLINE.value
|
|
439
487
|
delim = _get_delim(params=params)
|
|
@@ -460,6 +508,15 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
460
508
|
return create_ser_result()
|
|
461
509
|
|
|
462
510
|
|
|
511
|
+
class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
|
|
512
|
+
"""DocTags-specific annotation serializer."""
|
|
513
|
+
|
|
514
|
+
@override
|
|
515
|
+
def serialize(self, *, item: DocItem, **kwargs: Any) -> SerializationResult:
|
|
516
|
+
"""Serializes the item's annotations."""
|
|
517
|
+
return create_ser_result()
|
|
518
|
+
|
|
519
|
+
|
|
463
520
|
class DocTagsDocSerializer(DocSerializer):
|
|
464
521
|
"""DocTags-specific document serializer."""
|
|
465
522
|
|
|
@@ -473,6 +530,8 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
473
530
|
list_serializer: BaseListSerializer = DocTagsListSerializer()
|
|
474
531
|
inline_serializer: BaseInlineSerializer = DocTagsInlineSerializer()
|
|
475
532
|
|
|
533
|
+
annotation_serializer: BaseAnnotationSerializer = DocTagsAnnotationSerializer()
|
|
534
|
+
|
|
476
535
|
params: DocTagsParams = DocTagsParams()
|
|
477
536
|
|
|
478
537
|
@override
|
|
@@ -21,6 +21,7 @@ from pydantic import AnyUrl, BaseModel
|
|
|
21
21
|
from typing_extensions import override
|
|
22
22
|
|
|
23
23
|
from docling_core.transforms.serializer.base import (
|
|
24
|
+
BaseAnnotationSerializer,
|
|
24
25
|
BaseDocSerializer,
|
|
25
26
|
BaseFallbackSerializer,
|
|
26
27
|
BaseFormSerializer,
|
|
@@ -35,7 +36,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
35
36
|
from docling_core.transforms.serializer.common import (
|
|
36
37
|
CommonParams,
|
|
37
38
|
DocSerializer,
|
|
38
|
-
|
|
39
|
+
_get_annotation_text,
|
|
39
40
|
create_ser_result,
|
|
40
41
|
)
|
|
41
42
|
from docling_core.transforms.serializer.html_styles import (
|
|
@@ -47,6 +48,7 @@ from docling_core.types.doc.base import ImageRefMode
|
|
|
47
48
|
from docling_core.types.doc.document import (
|
|
48
49
|
CodeItem,
|
|
49
50
|
ContentLayer,
|
|
51
|
+
DescriptionAnnotation,
|
|
50
52
|
DocItem,
|
|
51
53
|
DoclingDocument,
|
|
52
54
|
FloatingItem,
|
|
@@ -59,7 +61,9 @@ from docling_core.types.doc.document import (
|
|
|
59
61
|
ListItem,
|
|
60
62
|
NodeItem,
|
|
61
63
|
OrderedList,
|
|
64
|
+
PictureClassificationData,
|
|
62
65
|
PictureItem,
|
|
66
|
+
PictureMoleculeData,
|
|
63
67
|
PictureTabularChartData,
|
|
64
68
|
SectionHeaderItem,
|
|
65
69
|
TableCell,
|
|
@@ -758,14 +762,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
758
762
|
"""HTML-specific fallback serializer."""
|
|
759
763
|
|
|
760
764
|
@override
|
|
761
|
-
def serialize(
|
|
762
|
-
self,
|
|
763
|
-
*,
|
|
764
|
-
item: NodeItem,
|
|
765
|
-
doc_serializer: "BaseDocSerializer",
|
|
766
|
-
doc: DoclingDocument,
|
|
767
|
-
**kwargs: Any,
|
|
768
|
-
) -> SerializationResult:
|
|
765
|
+
def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
|
|
769
766
|
"""Fallback serializer for items not handled by other serializers."""
|
|
770
767
|
if isinstance(item, DocItem):
|
|
771
768
|
return create_ser_result(
|
|
@@ -777,6 +774,42 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
777
774
|
return create_ser_result()
|
|
778
775
|
|
|
779
776
|
|
|
777
|
+
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
778
|
+
"""HTML-specific annotation serializer."""
|
|
779
|
+
|
|
780
|
+
def serialize(
|
|
781
|
+
self,
|
|
782
|
+
*,
|
|
783
|
+
item: DocItem,
|
|
784
|
+
doc: DoclingDocument,
|
|
785
|
+
**kwargs: Any,
|
|
786
|
+
) -> SerializationResult:
|
|
787
|
+
"""Serializes the passed annotation to HTML format."""
|
|
788
|
+
res_parts: list[SerializationResult] = []
|
|
789
|
+
for ann in item.get_annotations():
|
|
790
|
+
if isinstance(
|
|
791
|
+
ann,
|
|
792
|
+
(PictureClassificationData, DescriptionAnnotation, PictureMoleculeData),
|
|
793
|
+
):
|
|
794
|
+
if ann_text := _get_annotation_text(ann):
|
|
795
|
+
text_dir = get_text_direction(ann_text)
|
|
796
|
+
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
|
|
797
|
+
ann_ser_res = create_ser_result(
|
|
798
|
+
text=(
|
|
799
|
+
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
|
|
800
|
+
f"{html.escape(ann_text)}"
|
|
801
|
+
f"</div>"
|
|
802
|
+
),
|
|
803
|
+
span_source=item,
|
|
804
|
+
)
|
|
805
|
+
res_parts.append(ann_ser_res)
|
|
806
|
+
|
|
807
|
+
return create_ser_result(
|
|
808
|
+
text=" ".join([r.text for r in res_parts if r.text]),
|
|
809
|
+
span_source=res_parts,
|
|
810
|
+
)
|
|
811
|
+
|
|
812
|
+
|
|
780
813
|
class HTMLDocSerializer(DocSerializer):
|
|
781
814
|
"""HTML-specific document serializer."""
|
|
782
815
|
|
|
@@ -790,6 +823,8 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
790
823
|
list_serializer: BaseListSerializer = HTMLListSerializer()
|
|
791
824
|
inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
|
|
792
825
|
|
|
826
|
+
annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
|
|
827
|
+
|
|
793
828
|
params: HTMLParams = HTMLParams()
|
|
794
829
|
|
|
795
830
|
@override
|
|
@@ -812,6 +847,16 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
812
847
|
"""Apply HTML-specific strikethrough serialization."""
|
|
813
848
|
return f"<del>{text}</del>"
|
|
814
849
|
|
|
850
|
+
@override
|
|
851
|
+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
|
|
852
|
+
"""Apply HTML-specific subscript serialization."""
|
|
853
|
+
return f"<sub>{text}</sub>"
|
|
854
|
+
|
|
855
|
+
@override
|
|
856
|
+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
|
|
857
|
+
"""Apply HTML-specific superscript serialization."""
|
|
858
|
+
return f"<sup>{text}</sup>"
|
|
859
|
+
|
|
815
860
|
@override
|
|
816
861
|
def serialize_hyperlink(
|
|
817
862
|
self,
|
|
@@ -968,20 +1013,13 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
968
1013
|
results.append(cap_ser_res)
|
|
969
1014
|
|
|
970
1015
|
if params.include_annotations and item.self_ref not in excluded_refs:
|
|
971
|
-
if isinstance(item, PictureItem):
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
|
|
979
|
-
f"{html.escape(ann_text)}"
|
|
980
|
-
f"</div>"
|
|
981
|
-
),
|
|
982
|
-
span_source=item,
|
|
983
|
-
)
|
|
984
|
-
results.append(ann_ser_res)
|
|
1016
|
+
if isinstance(item, (PictureItem, TableItem)):
|
|
1017
|
+
ann_res = self.serialize_annotations(
|
|
1018
|
+
item=item,
|
|
1019
|
+
**kwargs,
|
|
1020
|
+
)
|
|
1021
|
+
if ann_res.text:
|
|
1022
|
+
results.append(ann_res)
|
|
985
1023
|
|
|
986
1024
|
text_res = params.caption_delim.join([r.text for r in results])
|
|
987
1025
|
if text_res:
|
|
@@ -15,6 +15,7 @@ from tabulate import tabulate
|
|
|
15
15
|
from typing_extensions import override
|
|
16
16
|
|
|
17
17
|
from docling_core.transforms.serializer.base import (
|
|
18
|
+
BaseAnnotationSerializer,
|
|
18
19
|
BaseDocSerializer,
|
|
19
20
|
BaseFallbackSerializer,
|
|
20
21
|
BaseFormSerializer,
|
|
@@ -29,7 +30,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
29
30
|
from docling_core.transforms.serializer.common import (
|
|
30
31
|
CommonParams,
|
|
31
32
|
DocSerializer,
|
|
32
|
-
|
|
33
|
+
_get_annotation_text,
|
|
33
34
|
_PageBreakSerResult,
|
|
34
35
|
create_ser_result,
|
|
35
36
|
)
|
|
@@ -37,6 +38,7 @@ from docling_core.types.doc.base import ImageRefMode
|
|
|
37
38
|
from docling_core.types.doc.document import (
|
|
38
39
|
CodeItem,
|
|
39
40
|
ContentLayer,
|
|
41
|
+
DescriptionAnnotation,
|
|
40
42
|
DocItem,
|
|
41
43
|
DoclingDocument,
|
|
42
44
|
FloatingItem,
|
|
@@ -48,7 +50,9 @@ from docling_core.types.doc.document import (
|
|
|
48
50
|
KeyValueItem,
|
|
49
51
|
NodeItem,
|
|
50
52
|
OrderedList,
|
|
53
|
+
PictureClassificationData,
|
|
51
54
|
PictureItem,
|
|
55
|
+
PictureMoleculeData,
|
|
52
56
|
PictureTabularChartData,
|
|
53
57
|
SectionHeaderItem,
|
|
54
58
|
TableItem,
|
|
@@ -58,6 +62,23 @@ from docling_core.types.doc.document import (
|
|
|
58
62
|
)
|
|
59
63
|
|
|
60
64
|
|
|
65
|
+
def _get_annotation_ser_result(
|
|
66
|
+
ann_kind: str, ann_text: str, mark_annotation: bool, doc_item: DocItem
|
|
67
|
+
):
|
|
68
|
+
return create_ser_result(
|
|
69
|
+
text=(
|
|
70
|
+
(
|
|
71
|
+
f'<!--<annotation kind="{ann_kind}">-->'
|
|
72
|
+
f"{ann_text}"
|
|
73
|
+
f"<!--<annotation/>-->"
|
|
74
|
+
)
|
|
75
|
+
if mark_annotation
|
|
76
|
+
else ann_text
|
|
77
|
+
),
|
|
78
|
+
span_source=doc_item,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
|
|
61
82
|
class MarkdownParams(CommonParams):
|
|
62
83
|
"""Markdown-specific serialization parameters."""
|
|
63
84
|
|
|
@@ -136,6 +157,49 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
136
157
|
return create_ser_result(text=text, span_source=res_parts)
|
|
137
158
|
|
|
138
159
|
|
|
160
|
+
class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
161
|
+
"""Markdown-specific annotation serializer."""
|
|
162
|
+
|
|
163
|
+
def serialize(
|
|
164
|
+
self,
|
|
165
|
+
*,
|
|
166
|
+
item: DocItem,
|
|
167
|
+
doc: DoclingDocument,
|
|
168
|
+
**kwargs: Any,
|
|
169
|
+
) -> SerializationResult:
|
|
170
|
+
"""Serialize the item's annotations."""
|
|
171
|
+
params = MarkdownParams(**kwargs)
|
|
172
|
+
|
|
173
|
+
res_parts: list[SerializationResult] = []
|
|
174
|
+
for ann in item.get_annotations():
|
|
175
|
+
if isinstance(
|
|
176
|
+
ann,
|
|
177
|
+
(
|
|
178
|
+
PictureClassificationData,
|
|
179
|
+
DescriptionAnnotation,
|
|
180
|
+
PictureMoleculeData,
|
|
181
|
+
),
|
|
182
|
+
):
|
|
183
|
+
if ann_text := _get_annotation_text(ann):
|
|
184
|
+
ann_res = create_ser_result(
|
|
185
|
+
text=(
|
|
186
|
+
(
|
|
187
|
+
f'<!--<annotation kind="{ann.kind}">-->'
|
|
188
|
+
f"{ann_text}"
|
|
189
|
+
f"<!--<annotation/>-->"
|
|
190
|
+
)
|
|
191
|
+
if params.mark_annotations
|
|
192
|
+
else ann_text
|
|
193
|
+
),
|
|
194
|
+
span_source=item,
|
|
195
|
+
)
|
|
196
|
+
res_parts.append(ann_res)
|
|
197
|
+
return create_ser_result(
|
|
198
|
+
text="\n\n".join([r.text for r in res_parts if r.text]),
|
|
199
|
+
span_source=item,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
|
|
139
203
|
class MarkdownTableSerializer(BaseTableSerializer):
|
|
140
204
|
"""Markdown-specific table item serializer."""
|
|
141
205
|
|
|
@@ -149,6 +213,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
149
213
|
**kwargs: Any,
|
|
150
214
|
) -> SerializationResult:
|
|
151
215
|
"""Serializes the passed item."""
|
|
216
|
+
params = MarkdownParams(**kwargs)
|
|
152
217
|
res_parts: list[SerializationResult] = []
|
|
153
218
|
|
|
154
219
|
cap_res = doc_serializer.serialize_captions(
|
|
@@ -159,6 +224,16 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
159
224
|
res_parts.append(cap_res)
|
|
160
225
|
|
|
161
226
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
227
|
+
|
|
228
|
+
if params.include_annotations:
|
|
229
|
+
|
|
230
|
+
ann_res = doc_serializer.serialize_annotations(
|
|
231
|
+
item=item,
|
|
232
|
+
**kwargs,
|
|
233
|
+
)
|
|
234
|
+
if ann_res.text:
|
|
235
|
+
res_parts.append(ann_res)
|
|
236
|
+
|
|
162
237
|
rows = [
|
|
163
238
|
[
|
|
164
239
|
# make sure that md tables are not broken
|
|
@@ -214,22 +289,12 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
214
289
|
|
|
215
290
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
216
291
|
if params.include_annotations:
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
f'<!--<annotation kind="{ann.kind}">-->'
|
|
224
|
-
f"{ann_text}"
|
|
225
|
-
f"<!--<annotation/>-->"
|
|
226
|
-
)
|
|
227
|
-
if params.mark_annotations
|
|
228
|
-
else ann_text
|
|
229
|
-
),
|
|
230
|
-
span_source=item,
|
|
231
|
-
)
|
|
232
|
-
res_parts.append(ann_ser_res)
|
|
292
|
+
ann_res = doc_serializer.serialize_annotations(
|
|
293
|
+
item=item,
|
|
294
|
+
**kwargs,
|
|
295
|
+
)
|
|
296
|
+
if ann_res.text:
|
|
297
|
+
res_parts.append(ann_res)
|
|
233
298
|
|
|
234
299
|
img_res = self._serialize_image_part(
|
|
235
300
|
item=item,
|
|
@@ -257,7 +322,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
257
322
|
res_parts.append(
|
|
258
323
|
create_ser_result(text=md_table_content, span_source=item)
|
|
259
324
|
)
|
|
260
|
-
text_res = "\n\n".join([r.text for r in res_parts])
|
|
325
|
+
text_res = "\n\n".join([r.text for r in res_parts if r.text])
|
|
261
326
|
|
|
262
327
|
return create_ser_result(text=text_res, span_source=res_parts)
|
|
263
328
|
|
|
@@ -471,6 +536,8 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
471
536
|
list_serializer: BaseListSerializer = MarkdownListSerializer()
|
|
472
537
|
inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
|
|
473
538
|
|
|
539
|
+
annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
|
|
540
|
+
|
|
474
541
|
params: MarkdownParams = MarkdownParams()
|
|
475
542
|
|
|
476
543
|
@override
|