docling-core 2.48.4__tar.gz → 2.50.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.48.4 → docling_core-2.50.0}/PKG-INFO +9 -4
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/cli/view.py +21 -5
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/serializer/base.py +31 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/serializer/common.py +180 -100
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/serializer/doctags.py +35 -20
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/serializer/html.py +78 -3
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/serializer/markdown.py +114 -5
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/doc/__init__.py +11 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/doc/document.py +359 -8
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/doc/tokens.py +6 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core.egg-info/PKG-INFO +9 -4
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core.egg-info/SOURCES.txt +1 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core.egg-info/requires.txt +2 -2
- {docling_core-2.48.4 → docling_core-2.50.0}/pyproject.toml +9 -4
- docling_core-2.50.0/test/test_metadata.py +301 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_serialization.py +44 -4
- {docling_core-2.48.4 → docling_core-2.50.0}/LICENSE +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/README.md +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/py.typed +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/search/package.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/setup.cfg +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_collection.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_doc_base.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_docling_doc.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_doctags_load.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_page.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_page_chunker.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_search_meta.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_utils.py +0 -0
- {docling_core-2.48.4 → docling_core-2.50.0}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.48.4
|
|
3
|
+
Version: 2.50.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -15,12 +15,17 @@ Classifier: Intended Audience :: Developers
|
|
|
15
15
|
Classifier: Intended Audience :: Science/Research
|
|
16
16
|
Classifier: Natural Language :: English
|
|
17
17
|
Classifier: Operating System :: OS Independent
|
|
18
|
-
Classifier: Programming Language :: Python :: 3
|
|
19
18
|
Classifier: Topic :: Database
|
|
20
19
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
21
20
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
21
|
Classifier: Typing :: Typed
|
|
23
22
|
Classifier: Programming Language :: Python :: 3
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
26
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
27
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
28
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
24
29
|
Requires-Python: <4.0,>=3.9
|
|
25
30
|
Description-Content-Type: text/markdown
|
|
26
31
|
License-File: LICENSE
|
|
@@ -29,7 +34,7 @@ Requires-Dist: pydantic!=2.10.0,!=2.10.1,!=2.10.2,<3.0.0,>=2.6.0
|
|
|
29
34
|
Requires-Dist: jsonref<2.0.0,>=1.1.0
|
|
30
35
|
Requires-Dist: tabulate<0.10.0,>=0.9.0
|
|
31
36
|
Requires-Dist: pandas<3.0.0,>=2.1.4
|
|
32
|
-
Requires-Dist: pillow<
|
|
37
|
+
Requires-Dist: pillow<13.0.0,>=10.0.0
|
|
33
38
|
Requires-Dist: pyyaml<7.0.0,>=5.1
|
|
34
39
|
Requires-Dist: typing-extensions<5.0.0,>=4.12.2
|
|
35
40
|
Requires-Dist: typer<0.20.0,>=0.12.5
|
|
@@ -39,7 +44,7 @@ Requires-Dist: semchunk<3.0.0,>=2.2.0; extra == "chunking"
|
|
|
39
44
|
Requires-Dist: transformers<5.0.0,>=4.34.0; extra == "chunking"
|
|
40
45
|
Provides-Extra: chunking-openai
|
|
41
46
|
Requires-Dist: semchunk; extra == "chunking-openai"
|
|
42
|
-
Requires-Dist: tiktoken<0.
|
|
47
|
+
Requires-Dist: tiktoken<0.13.0,>=0.9.0; extra == "chunking-openai"
|
|
43
48
|
Dynamic: license-file
|
|
44
49
|
|
|
45
50
|
# Docling Core
|
|
@@ -39,9 +39,17 @@ def view(
|
|
|
39
39
|
typer.Argument(
|
|
40
40
|
...,
|
|
41
41
|
metavar="source",
|
|
42
|
-
help="Docling JSON file to view.",
|
|
42
|
+
help="Docling JSON or YAML file to view.",
|
|
43
43
|
),
|
|
44
44
|
],
|
|
45
|
+
split_view: Annotated[
|
|
46
|
+
bool,
|
|
47
|
+
typer.Option(
|
|
48
|
+
"--split-view",
|
|
49
|
+
"-s",
|
|
50
|
+
help="Split view of the document.",
|
|
51
|
+
),
|
|
52
|
+
] = False,
|
|
45
53
|
version: Annotated[
|
|
46
54
|
Optional[bool],
|
|
47
55
|
typer.Option(
|
|
@@ -52,11 +60,19 @@ def view(
|
|
|
52
60
|
),
|
|
53
61
|
] = None,
|
|
54
62
|
):
|
|
55
|
-
"""Display a
|
|
63
|
+
"""Display a DoclingDocument file on the default browser."""
|
|
56
64
|
path = resolve_source_to_path(source=source)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
65
|
+
if path.suffix == ".json":
|
|
66
|
+
doc = DoclingDocument.load_from_json(filename=path)
|
|
67
|
+
elif path.suffix in [".yaml", ".yml"]:
|
|
68
|
+
doc = DoclingDocument.load_from_yaml(filename=path)
|
|
69
|
+
else:
|
|
70
|
+
raise ValueError(f"Unsupported file type: {path.suffix}")
|
|
71
|
+
target_path = Path(tempfile.mkdtemp()) / f"{path.stem}.html"
|
|
72
|
+
html_output = doc.export_to_html(
|
|
73
|
+
image_mode=ImageRefMode.EMBEDDED,
|
|
74
|
+
split_page_view=split_view,
|
|
75
|
+
)
|
|
60
76
|
with open(target_path, "w", encoding="utf-8") as f:
|
|
61
77
|
f.write(html_output)
|
|
62
78
|
webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
|
|
@@ -9,6 +9,7 @@ from pathlib import Path
|
|
|
9
9
|
from typing import Any, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import AnyUrl, BaseModel
|
|
12
|
+
from typing_extensions import deprecated
|
|
12
13
|
|
|
13
14
|
from docling_core.types.doc.document import (
|
|
14
15
|
DocItem,
|
|
@@ -258,6 +259,7 @@ class BaseDocSerializer(ABC):
|
|
|
258
259
|
"""Serialize the item's captions."""
|
|
259
260
|
...
|
|
260
261
|
|
|
262
|
+
@deprecated("Use serialize_meta() instead.")
|
|
261
263
|
@abstractmethod
|
|
262
264
|
def serialize_annotations(
|
|
263
265
|
self,
|
|
@@ -267,6 +269,15 @@ class BaseDocSerializer(ABC):
|
|
|
267
269
|
"""Serialize the item's annotations."""
|
|
268
270
|
...
|
|
269
271
|
|
|
272
|
+
@abstractmethod
|
|
273
|
+
def serialize_meta(
|
|
274
|
+
self,
|
|
275
|
+
item: NodeItem,
|
|
276
|
+
**kwargs: Any,
|
|
277
|
+
) -> SerializationResult:
|
|
278
|
+
"""Serialize the item's meta."""
|
|
279
|
+
...
|
|
280
|
+
|
|
270
281
|
@abstractmethod
|
|
271
282
|
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
|
|
272
283
|
"""Get references to excluded items."""
|
|
@@ -287,6 +298,26 @@ class BaseSerializerProvider(ABC):
|
|
|
287
298
|
...
|
|
288
299
|
|
|
289
300
|
|
|
301
|
+
class BaseMetaSerializer(ABC):
|
|
302
|
+
"""Base class for meta serializers."""
|
|
303
|
+
|
|
304
|
+
@abstractmethod
|
|
305
|
+
def serialize(
|
|
306
|
+
self,
|
|
307
|
+
*,
|
|
308
|
+
item: NodeItem,
|
|
309
|
+
doc: DoclingDocument,
|
|
310
|
+
**kwargs: Any,
|
|
311
|
+
) -> SerializationResult:
|
|
312
|
+
"""Serializes the meta of the passed item."""
|
|
313
|
+
...
|
|
314
|
+
|
|
315
|
+
def _humanize_text(self, text: str, title: bool = False) -> str:
|
|
316
|
+
tmp = text.replace("__", "_").replace("_", " ")
|
|
317
|
+
return tmp.title() if title else tmp.capitalize()
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
@deprecated("Use BaseMetaSerializer() instead.")
|
|
290
321
|
class BaseAnnotationSerializer(ABC):
|
|
291
322
|
"""Base class for annotation serializers."""
|
|
292
323
|
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
#
|
|
5
5
|
|
|
6
6
|
"""Define base classes for serialization."""
|
|
7
|
+
import logging
|
|
7
8
|
import re
|
|
8
9
|
import sys
|
|
9
10
|
from abc import abstractmethod
|
|
@@ -11,7 +12,14 @@ from functools import cached_property
|
|
|
11
12
|
from pathlib import Path
|
|
12
13
|
from typing import Any, Iterable, Optional, Tuple, Union
|
|
13
14
|
|
|
14
|
-
from pydantic import
|
|
15
|
+
from pydantic import (
|
|
16
|
+
AnyUrl,
|
|
17
|
+
BaseModel,
|
|
18
|
+
ConfigDict,
|
|
19
|
+
Field,
|
|
20
|
+
NonNegativeInt,
|
|
21
|
+
computed_field,
|
|
22
|
+
)
|
|
15
23
|
from typing_extensions import Self, override
|
|
16
24
|
|
|
17
25
|
from docling_core.transforms.serializer.base import (
|
|
@@ -22,6 +30,7 @@ from docling_core.transforms.serializer.base import (
|
|
|
22
30
|
BaseInlineSerializer,
|
|
23
31
|
BaseKeyValueSerializer,
|
|
24
32
|
BaseListSerializer,
|
|
33
|
+
BaseMetaSerializer,
|
|
25
34
|
BasePictureSerializer,
|
|
26
35
|
BaseTableSerializer,
|
|
27
36
|
BaseTextSerializer,
|
|
@@ -56,6 +65,9 @@ _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
|
|
|
56
65
|
_DEFAULT_LAYERS = {cl for cl in ContentLayer}
|
|
57
66
|
|
|
58
67
|
|
|
68
|
+
_logger = logging.getLogger(__name__)
|
|
69
|
+
|
|
70
|
+
|
|
59
71
|
class _PageBreakNode(NodeItem):
|
|
60
72
|
"""Page break node."""
|
|
61
73
|
|
|
@@ -76,11 +88,11 @@ def _iterate_items(
|
|
|
76
88
|
traverse_pictures: bool = False,
|
|
77
89
|
add_page_breaks: bool = False,
|
|
78
90
|
visited: Optional[set[str]] = None,
|
|
79
|
-
):
|
|
91
|
+
) -> Iterable[Tuple[NodeItem, int]]:
|
|
80
92
|
my_visited: set[str] = visited if visited is not None else set()
|
|
81
93
|
prev_page_nr: Optional[int] = None
|
|
82
94
|
page_break_i = 0
|
|
83
|
-
for item,
|
|
95
|
+
for item, lvl in doc.iterate_items(
|
|
84
96
|
root=node,
|
|
85
97
|
with_groups=True,
|
|
86
98
|
included_content_layers=layers,
|
|
@@ -93,7 +105,7 @@ def _iterate_items(
|
|
|
93
105
|
):
|
|
94
106
|
# if group starts with new page, yield page break before group node
|
|
95
107
|
my_visited.add(item.self_ref)
|
|
96
|
-
for it in _iterate_items(
|
|
108
|
+
for it, _ in _iterate_items(
|
|
97
109
|
doc=doc,
|
|
98
110
|
layers=layers,
|
|
99
111
|
node=item,
|
|
@@ -108,7 +120,7 @@ def _iterate_items(
|
|
|
108
120
|
self_ref=f"#/pb/{page_break_i}",
|
|
109
121
|
prev_page=prev_page_nr,
|
|
110
122
|
next_page=page_no,
|
|
111
|
-
)
|
|
123
|
+
), lvl
|
|
112
124
|
break
|
|
113
125
|
elif isinstance(item, DocItem) and item.prov:
|
|
114
126
|
page_no = item.prov[0].page_no
|
|
@@ -118,10 +130,10 @@ def _iterate_items(
|
|
|
118
130
|
self_ref=f"#/pb/{page_break_i}",
|
|
119
131
|
prev_page=prev_page_nr,
|
|
120
132
|
next_page=page_no,
|
|
121
|
-
)
|
|
133
|
+
), lvl
|
|
122
134
|
page_break_i += 1
|
|
123
135
|
prev_page_nr = page_no
|
|
124
|
-
yield item
|
|
136
|
+
yield item, lvl
|
|
125
137
|
|
|
126
138
|
|
|
127
139
|
def _get_annotation_text(
|
|
@@ -188,9 +200,22 @@ class CommonParams(BaseModel):
|
|
|
188
200
|
start_idx: NonNegativeInt = 0
|
|
189
201
|
stop_idx: NonNegativeInt = sys.maxsize
|
|
190
202
|
|
|
203
|
+
include_non_meta: bool = True
|
|
204
|
+
|
|
191
205
|
include_formatting: bool = True
|
|
192
206
|
include_hyperlinks: bool = True
|
|
193
207
|
caption_delim: str = " "
|
|
208
|
+
use_legacy_annotations: bool = Field(
|
|
209
|
+
default=False, description="Use legacy annotation serialization."
|
|
210
|
+
)
|
|
211
|
+
allowed_meta_names: Optional[set[str]] = Field(
|
|
212
|
+
default=None,
|
|
213
|
+
description="Meta name to allow; None means all meta names are allowed.",
|
|
214
|
+
)
|
|
215
|
+
blocked_meta_names: set[str] = Field(
|
|
216
|
+
default_factory=set,
|
|
217
|
+
description="Meta name to block; takes precedence over allowed_meta_names.",
|
|
218
|
+
)
|
|
194
219
|
|
|
195
220
|
def merge_with_patch(self, patch: dict[str, Any]) -> Self:
|
|
196
221
|
"""Create an instance by merging the provided patch dict on top of self."""
|
|
@@ -215,6 +240,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
215
240
|
list_serializer: BaseListSerializer
|
|
216
241
|
inline_serializer: BaseInlineSerializer
|
|
217
242
|
|
|
243
|
+
meta_serializer: Optional[BaseMetaSerializer] = None
|
|
218
244
|
annotation_serializer: BaseAnnotationSerializer
|
|
219
245
|
|
|
220
246
|
params: CommonParams = CommonParams()
|
|
@@ -245,7 +271,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
245
271
|
if refs is None:
|
|
246
272
|
refs = {
|
|
247
273
|
item.self_ref
|
|
248
|
-
for ix, item in enumerate(
|
|
274
|
+
for ix, (item, _) in enumerate(
|
|
249
275
|
_iterate_items(
|
|
250
276
|
doc=self.doc,
|
|
251
277
|
traverse_pictures=True,
|
|
@@ -301,103 +327,130 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
301
327
|
) -> SerializationResult:
|
|
302
328
|
"""Serialize a given node."""
|
|
303
329
|
my_visited: set[str] = visited if visited is not None else set()
|
|
330
|
+
parts: list[SerializationResult] = []
|
|
331
|
+
delim: str = kwargs.get("delim", "\n")
|
|
332
|
+
my_params = self.params.model_copy(update=kwargs)
|
|
304
333
|
my_kwargs = {**self.params.model_dump(), **kwargs}
|
|
305
334
|
empty_res = create_ser_result()
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
335
|
+
|
|
336
|
+
my_item = item or self.doc.body
|
|
337
|
+
|
|
338
|
+
if my_item == self.doc.body:
|
|
339
|
+
if my_item.meta and not my_params.use_legacy_annotations:
|
|
340
|
+
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
|
|
341
|
+
if meta_part.text:
|
|
342
|
+
parts.append(meta_part)
|
|
343
|
+
|
|
344
|
+
if my_item.self_ref not in my_visited:
|
|
345
|
+
my_visited.add(my_item.self_ref)
|
|
346
|
+
part = self._serialize_body(**my_kwargs)
|
|
347
|
+
if part.text:
|
|
348
|
+
parts.append(part)
|
|
349
|
+
return create_ser_result(
|
|
350
|
+
text=delim.join([p.text for p in parts if p.text]),
|
|
351
|
+
span_source=parts,
|
|
352
|
+
)
|
|
310
353
|
else:
|
|
311
354
|
return empty_res
|
|
312
355
|
|
|
313
|
-
my_visited.add(
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
356
|
+
my_visited.add(my_item.self_ref)
|
|
357
|
+
|
|
358
|
+
if my_item.meta and not my_params.use_legacy_annotations:
|
|
359
|
+
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
|
|
360
|
+
if meta_part.text:
|
|
361
|
+
parts.append(meta_part)
|
|
362
|
+
|
|
363
|
+
if my_params.include_non_meta:
|
|
364
|
+
########
|
|
365
|
+
# groups
|
|
366
|
+
########
|
|
367
|
+
if isinstance(my_item, ListGroup):
|
|
368
|
+
part = self.list_serializer.serialize(
|
|
369
|
+
item=my_item,
|
|
370
|
+
doc_serializer=self,
|
|
371
|
+
doc=self.doc,
|
|
372
|
+
list_level=list_level,
|
|
373
|
+
is_inline_scope=is_inline_scope,
|
|
374
|
+
visited=my_visited,
|
|
375
|
+
**my_kwargs,
|
|
376
|
+
)
|
|
377
|
+
elif isinstance(my_item, InlineGroup):
|
|
378
|
+
part = self.inline_serializer.serialize(
|
|
379
|
+
item=my_item,
|
|
380
|
+
doc_serializer=self,
|
|
381
|
+
doc=self.doc,
|
|
382
|
+
list_level=list_level,
|
|
383
|
+
visited=my_visited,
|
|
384
|
+
**my_kwargs,
|
|
385
|
+
)
|
|
386
|
+
###########
|
|
387
|
+
# doc items
|
|
388
|
+
###########
|
|
389
|
+
elif isinstance(my_item, TextItem):
|
|
390
|
+
if my_item.self_ref in self._captions_of_some_item:
|
|
391
|
+
# those captions will be handled by the floating item holding them
|
|
392
|
+
return empty_res
|
|
393
|
+
else:
|
|
394
|
+
part = (
|
|
395
|
+
self.text_serializer.serialize(
|
|
396
|
+
item=my_item,
|
|
397
|
+
doc_serializer=self,
|
|
398
|
+
doc=self.doc,
|
|
399
|
+
is_inline_scope=is_inline_scope,
|
|
400
|
+
visited=my_visited,
|
|
401
|
+
**my_kwargs,
|
|
402
|
+
)
|
|
403
|
+
if my_item.self_ref not in self.get_excluded_refs(**kwargs)
|
|
404
|
+
else empty_res
|
|
353
405
|
)
|
|
354
|
-
|
|
355
|
-
|
|
406
|
+
elif isinstance(my_item, TableItem):
|
|
407
|
+
part = self.table_serializer.serialize(
|
|
408
|
+
item=my_item,
|
|
409
|
+
doc_serializer=self,
|
|
410
|
+
doc=self.doc,
|
|
411
|
+
visited=my_visited,
|
|
412
|
+
**my_kwargs,
|
|
356
413
|
)
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
)
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
)
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
visited=my_visited,
|
|
398
|
-
**my_kwargs,
|
|
399
|
-
)
|
|
400
|
-
return part
|
|
414
|
+
elif isinstance(my_item, PictureItem):
|
|
415
|
+
part = self.picture_serializer.serialize(
|
|
416
|
+
item=my_item,
|
|
417
|
+
doc_serializer=self,
|
|
418
|
+
doc=self.doc,
|
|
419
|
+
visited=my_visited,
|
|
420
|
+
**my_kwargs,
|
|
421
|
+
)
|
|
422
|
+
elif isinstance(my_item, KeyValueItem):
|
|
423
|
+
part = self.key_value_serializer.serialize(
|
|
424
|
+
item=my_item,
|
|
425
|
+
doc_serializer=self,
|
|
426
|
+
doc=self.doc,
|
|
427
|
+
**my_kwargs,
|
|
428
|
+
)
|
|
429
|
+
elif isinstance(my_item, FormItem):
|
|
430
|
+
part = self.form_serializer.serialize(
|
|
431
|
+
item=my_item,
|
|
432
|
+
doc_serializer=self,
|
|
433
|
+
doc=self.doc,
|
|
434
|
+
**my_kwargs,
|
|
435
|
+
)
|
|
436
|
+
elif isinstance(my_item, _PageBreakNode):
|
|
437
|
+
part = _PageBreakSerResult(
|
|
438
|
+
text=self._create_page_break(node=my_item),
|
|
439
|
+
node=my_item,
|
|
440
|
+
)
|
|
441
|
+
else:
|
|
442
|
+
part = self.fallback_serializer.serialize(
|
|
443
|
+
item=my_item,
|
|
444
|
+
doc_serializer=self,
|
|
445
|
+
doc=self.doc,
|
|
446
|
+
visited=my_visited,
|
|
447
|
+
**my_kwargs,
|
|
448
|
+
)
|
|
449
|
+
parts.append(part)
|
|
450
|
+
|
|
451
|
+
return create_ser_result(
|
|
452
|
+
text=delim.join([p.text for p in parts if p.text]), span_source=parts
|
|
453
|
+
)
|
|
401
454
|
|
|
402
455
|
# making some assumptions about the kwargs it can pass
|
|
403
456
|
@override
|
|
@@ -416,7 +469,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
416
469
|
my_visited: set[str] = visited if visited is not None else set()
|
|
417
470
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
418
471
|
|
|
419
|
-
for node in _iterate_items(
|
|
472
|
+
for node, lvl in _iterate_items(
|
|
420
473
|
node=item,
|
|
421
474
|
doc=self.doc,
|
|
422
475
|
layers=params.layers,
|
|
@@ -426,15 +479,17 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
426
479
|
continue
|
|
427
480
|
else:
|
|
428
481
|
my_visited.add(node.self_ref)
|
|
482
|
+
|
|
429
483
|
part = self.serialize(
|
|
430
484
|
item=node,
|
|
431
485
|
list_level=list_level,
|
|
432
486
|
is_inline_scope=is_inline_scope,
|
|
433
487
|
visited=my_visited,
|
|
434
|
-
**kwargs,
|
|
488
|
+
**(dict(level=lvl) | kwargs),
|
|
435
489
|
)
|
|
436
490
|
if part.text:
|
|
437
491
|
parts.append(part)
|
|
492
|
+
|
|
438
493
|
return parts
|
|
439
494
|
|
|
440
495
|
@override
|
|
@@ -528,6 +583,31 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
528
583
|
text_res = ""
|
|
529
584
|
return create_ser_result(text=text_res, span_source=results)
|
|
530
585
|
|
|
586
|
+
@override
|
|
587
|
+
def serialize_meta(
|
|
588
|
+
self,
|
|
589
|
+
item: NodeItem,
|
|
590
|
+
**kwargs: Any,
|
|
591
|
+
) -> SerializationResult:
|
|
592
|
+
"""Serialize the item's meta."""
|
|
593
|
+
if self.meta_serializer:
|
|
594
|
+
if item.self_ref not in self.get_excluded_refs(**kwargs):
|
|
595
|
+
return self.meta_serializer.serialize(
|
|
596
|
+
item=item,
|
|
597
|
+
doc=self.doc,
|
|
598
|
+
**(self.params.model_dump() | kwargs),
|
|
599
|
+
)
|
|
600
|
+
else:
|
|
601
|
+
return create_ser_result(
|
|
602
|
+
text="", span_source=item if isinstance(item, DocItem) else []
|
|
603
|
+
)
|
|
604
|
+
else:
|
|
605
|
+
_logger.warning("No meta serializer found.")
|
|
606
|
+
return create_ser_result(
|
|
607
|
+
text="", span_source=item if isinstance(item, DocItem) else []
|
|
608
|
+
)
|
|
609
|
+
|
|
610
|
+
# TODO deprecate
|
|
531
611
|
@override
|
|
532
612
|
def serialize_annotations(
|
|
533
613
|
self,
|
|
@@ -44,6 +44,7 @@ from docling_core.types.doc.document import (
|
|
|
44
44
|
PictureTabularChartData,
|
|
45
45
|
ProvenanceItem,
|
|
46
46
|
SectionHeaderItem,
|
|
47
|
+
TableData,
|
|
47
48
|
TableItem,
|
|
48
49
|
TextItem,
|
|
49
50
|
)
|
|
@@ -233,13 +234,22 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
233
234
|
ysize=params.ysize,
|
|
234
235
|
)
|
|
235
236
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
237
|
+
# handle classification data
|
|
238
|
+
predicted_class: Optional[str] = None
|
|
239
|
+
if item.meta and item.meta.classification:
|
|
240
|
+
predicted_class = (
|
|
241
|
+
item.meta.classification.get_main_prediction().class_name
|
|
242
|
+
)
|
|
243
|
+
elif (
|
|
244
|
+
classifications := [
|
|
245
|
+
ann
|
|
246
|
+
for ann in item.annotations
|
|
247
|
+
if isinstance(ann, PictureClassificationData)
|
|
248
|
+
]
|
|
249
|
+
) and classifications[0].predicted_classes:
|
|
242
250
|
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
251
|
+
if predicted_class:
|
|
252
|
+
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
243
253
|
if predicted_class in [
|
|
244
254
|
PictureClassificationLabel.PIE_CHART,
|
|
245
255
|
PictureClassificationLabel.BAR_CHART,
|
|
@@ -250,26 +260,31 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
250
260
|
PictureClassificationLabel.HEATMAP,
|
|
251
261
|
]:
|
|
252
262
|
is_chart = True
|
|
253
|
-
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
254
263
|
|
|
255
|
-
|
|
264
|
+
# handle molecule data
|
|
265
|
+
smi: Optional[str] = None
|
|
266
|
+
if item.meta and item.meta.molecule:
|
|
267
|
+
smi = item.meta.molecule.smi
|
|
268
|
+
elif smiles_annotations := [
|
|
256
269
|
ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)
|
|
257
|
-
]
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
270
|
+
]:
|
|
271
|
+
smi = smiles_annotations[0].smi
|
|
272
|
+
if smi:
|
|
273
|
+
body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value)
|
|
274
|
+
|
|
275
|
+
# handle tabular chart data
|
|
276
|
+
chart_data: Optional[TableData] = None
|
|
277
|
+
if item.meta and item.meta.tabular_chart:
|
|
278
|
+
chart_data = item.meta.tabular_chart.chart_data
|
|
279
|
+
elif tabular_chart_annotations := [
|
|
264
280
|
ann
|
|
265
281
|
for ann in item.annotations
|
|
266
282
|
if isinstance(ann, PictureTabularChartData)
|
|
267
|
-
]
|
|
268
|
-
|
|
283
|
+
]:
|
|
284
|
+
chart_data = tabular_chart_annotations[0].chart_data
|
|
285
|
+
if chart_data and chart_data.table_cells:
|
|
269
286
|
temp_doc = DoclingDocument(name="temp")
|
|
270
|
-
temp_table = temp_doc.add_table(
|
|
271
|
-
data=tabular_chart_annotations[0].chart_data
|
|
272
|
-
)
|
|
287
|
+
temp_table = temp_doc.add_table(data=chart_data)
|
|
273
288
|
otsl_content = temp_table.export_to_otsl(
|
|
274
289
|
temp_doc, add_cell_location=False
|
|
275
290
|
)
|