docling-core 2.31.2__tar.gz → 2.33.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling-core might be problematic.
- docling_core-2.33.0/PKG-INFO +143 -0
- docling_core-2.33.0/README.md +99 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/hybrid_chunker.py +7 -8
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +20 -12
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/common.py +23 -6
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/html.py +42 -11
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/markdown.py +21 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/base.py +38 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/document.py +24 -4
- docling_core-2.33.0/docling_core.egg-info/PKG-INFO +143 -0
- docling_core-2.33.0/docling_core.egg-info/SOURCES.txt +104 -0
- docling_core-2.33.0/docling_core.egg-info/dependency_links.txt +1 -0
- docling_core-2.33.0/docling_core.egg-info/entry_points.txt +2 -0
- docling_core-2.33.0/docling_core.egg-info/requires.txt +18 -0
- docling_core-2.33.0/docling_core.egg-info/top_level.txt +1 -0
- docling_core-2.33.0/pyproject.toml +157 -0
- docling_core-2.33.0/setup.cfg +4 -0
- docling_core-2.33.0/test/test_base.py +295 -0
- docling_core-2.33.0/test/test_collection.py +158 -0
- docling_core-2.33.0/test/test_data_gen_flag.py +9 -0
- docling_core-2.33.0/test/test_doc_base.py +45 -0
- docling_core-2.33.0/test/test_doc_legacy_convert.py +40 -0
- docling_core-2.33.0/test/test_doc_schema.py +147 -0
- docling_core-2.33.0/test/test_doc_schema_extractor.py +31 -0
- docling_core-2.33.0/test/test_docling_doc.py +1603 -0
- docling_core-2.33.0/test/test_doctags_load.py +143 -0
- docling_core-2.33.0/test/test_hierarchical_chunker.py +73 -0
- docling_core-2.33.0/test/test_hybrid_chunker.py +384 -0
- docling_core-2.33.0/test/test_json_schema_to_search_mapper.py +106 -0
- docling_core-2.33.0/test/test_nlp_qa.py +46 -0
- docling_core-2.33.0/test/test_otsl_table_export.py +284 -0
- docling_core-2.33.0/test/test_page.py +79 -0
- docling_core-2.33.0/test/test_rec_schema.py +268 -0
- docling_core-2.33.0/test/test_search_meta.py +49 -0
- docling_core-2.33.0/test/test_serialization.py +372 -0
- docling_core-2.33.0/test/test_utils.py +94 -0
- docling_core-2.33.0/test/test_visualization.py +42 -0
- docling_core-2.31.2/PKG-INFO +0 -143
- docling_core-2.31.2/README.md +0 -97
- docling_core-2.31.2/pyproject.toml +0 -168
- {docling_core-2.31.2 → docling_core-2.33.0}/LICENSE +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/py.typed +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/package.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/base.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/validators.py +0 -0
docling_core-2.33.0/PKG-INFO
ADDED
@@ -0,0 +1,143 @@
+Metadata-Version: 2.4
+Name: docling-core
+Version: 2.33.0
+Summary: A python library to define and validate data types in Docling.
+Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
+Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
+License-Expression: MIT
+Project-URL: homepage, https://github.com/docling-project
+Project-URL: repository, https://github.com/docling-project/docling-core
+Project-URL: issues, https://github.com/docling-project/docling-core/issues
+Project-URL: changelog, https://github.com/docling-project/docling-core/blob/main/CHANGELOG.md
+Keywords: docling,discovery,etl,information retrieval,analytics,database,database schema,schema,JSON
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Database
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Typing :: Typed
+Classifier: Programming Language :: Python :: 3
+Requires-Python: <4.0,>=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: jsonschema<5.0.0,>=4.16.0
+Requires-Dist: pydantic!=2.10.0,!=2.10.1,!=2.10.2,<3.0.0,>=2.6.0
+Requires-Dist: jsonref<2.0.0,>=1.1.0
+Requires-Dist: tabulate<0.10.0,>=0.9.0
+Requires-Dist: pandas<3.0.0,>=2.1.4
+Requires-Dist: pillow<12.0.0,>=10.0.0
+Requires-Dist: pyyaml<7.0.0,>=5.1
+Requires-Dist: typing-extensions<5.0.0,>=4.12.2
+Requires-Dist: typer<0.16.0,>=0.12.5
+Requires-Dist: latex2mathml<4.0.0,>=3.77.0
+Provides-Extra: chunking
+Requires-Dist: semchunk<3.0.0,>=2.2.0; extra == "chunking"
+Requires-Dist: transformers<5.0.0,>=4.34.0; extra == "chunking"
+Provides-Extra: chunking-openai
+Requires-Dist: semchunk; extra == "chunking-openai"
+Requires-Dist: tiktoken<0.10.0,>=0.9.0; extra == "chunking-openai"
+Dynamic: license-file
+
+# Docling Core
+
+[](https://pypi.org/project/docling-core/)
+
+[](https://github.com/astral-sh/uv)
+[](https://github.com/psf/black)
+[](https://pycqa.github.io/isort/)
+[](https://mypy-lang.org/)
+[](https://pydantic.dev)
+[](https://github.com/pre-commit/pre-commit)
+[](https://opensource.org/licenses/MIT)
+
+Docling Core is a library that defines core data types and transformations in [Docling](https://github.com/docling-project/docling).
+
+## Installation
+
+To use Docling Core, simply install `docling-core` from your package manager, e.g. pip:
+```bash
+pip install docling-core
+```
+
+### Development setup
+
+To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and uv. You can then install from your local clone's root dir:
+```bash
+uv sync --all-extras
+```
+
+To run the pytest suite, execute:
+```
+uv run pytest -s test
+```
+
+## Main features
+
+Docling Core provides the foundational DoclingDocument data model and API, as well as
+additional APIs for tasks like serialization and chunking, which are key to developing
+generative AI applications using Docling.
+
+### DoclingDocument
+
+Docling Core defines the DoclingDocument as a Pydantic model, allowing for advanced
+data model control, customizability, and interoperability.
+
+In addition to specifying the schema, it provides a handy API for building documents,
+as well as for basic operations, e.g. exporting to various formats, like Markdown, HTML,
+and others.
+
+👉 More details:
+- [Architecture docs](https://docling-project.github.io/docling/concepts/architecture/)
+- [DoclingDocument docs](https://docling-project.github.io/docling/concepts/docling_document/)
+
+### Serialization
+
+Different users can have varying requirements when it comes to serialization.
+To address this, the Serialization API introduces a design that allows easy extension,
+while providing feature-rich built-in implementations (on which the respective
+DoclingDocument helpers are actually based).
+
+👉 More details:
+- [Serialization docs](https://docling-project.github.io/docling/concepts/serialization/)
+- [Serialization example](https://docling-project.github.io/docling/examples/serialization/)
+
+### Chunking
+
+Similarly to above, the Chunking API provides built-in chunking capabilities as well as
+a design that enables easy extension, this way tackling customization requirements of
+different use cases.
+
+👉 More details:
+- [Chunking docs](https://docling-project.github.io/docling/concepts/chunking/)
+- [Hybrid chunking example](https://docling-project.github.io/docling/examples/hybrid_chunking/)
+- [Advanced chunking and serialization](https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/)
+
+## Contributing
+
+Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
+
+## References
+
+If you use Docling Core in your projects, please consider citing the following:
+
+```bib
+@techreport{Docling,
+  author = "Deep Search Team",
+  month = 8,
+  title = "Docling Technical Report",
+  url = "https://arxiv.org/abs/2408.09869",
+  eprint = "2408.09869",
+  doi = "10.48550/arXiv.2408.09869",
+  version = "1.0.0",
+  year = 2024
+}
+```
+
+## License
+
+The Docling Core codebase is under MIT license.
+For individual model usage, please refer to the model licenses found in the original packages.
docling_core-2.33.0/README.md
ADDED
@@ -0,0 +1,99 @@
+# Docling Core
+
+[](https://pypi.org/project/docling-core/)
+
+[](https://github.com/astral-sh/uv)
+[](https://github.com/psf/black)
+[](https://pycqa.github.io/isort/)
+[](https://mypy-lang.org/)
+[](https://pydantic.dev)
+[](https://github.com/pre-commit/pre-commit)
+[](https://opensource.org/licenses/MIT)
+
+Docling Core is a library that defines core data types and transformations in [Docling](https://github.com/docling-project/docling).
+
+## Installation
+
+To use Docling Core, simply install `docling-core` from your package manager, e.g. pip:
+```bash
+pip install docling-core
+```
+
+### Development setup
+
+To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and uv. You can then install from your local clone's root dir:
+```bash
+uv sync --all-extras
+```
+
+To run the pytest suite, execute:
+```
+uv run pytest -s test
+```
+
+## Main features
+
+Docling Core provides the foundational DoclingDocument data model and API, as well as
+additional APIs for tasks like serialization and chunking, which are key to developing
+generative AI applications using Docling.
+
+### DoclingDocument
+
+Docling Core defines the DoclingDocument as a Pydantic model, allowing for advanced
+data model control, customizability, and interoperability.
+
+In addition to specifying the schema, it provides a handy API for building documents,
+as well as for basic operations, e.g. exporting to various formats, like Markdown, HTML,
+and others.
+
+👉 More details:
+- [Architecture docs](https://docling-project.github.io/docling/concepts/architecture/)
+- [DoclingDocument docs](https://docling-project.github.io/docling/concepts/docling_document/)
+
+### Serialization
+
+Different users can have varying requirements when it comes to serialization.
+To address this, the Serialization API introduces a design that allows easy extension,
+while providing feature-rich built-in implementations (on which the respective
+DoclingDocument helpers are actually based).
+
+👉 More details:
+- [Serialization docs](https://docling-project.github.io/docling/concepts/serialization/)
+- [Serialization example](https://docling-project.github.io/docling/examples/serialization/)
+
+### Chunking
+
+Similarly to above, the Chunking API provides built-in chunking capabilities as well as
+a design that enables easy extension, this way tackling customization requirements of
+different use cases.
+
+👉 More details:
+- [Chunking docs](https://docling-project.github.io/docling/concepts/chunking/)
+- [Hybrid chunking example](https://docling-project.github.io/docling/examples/hybrid_chunking/)
+- [Advanced chunking and serialization](https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/)
+
+## Contributing
+
+Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
+
+## References
+
+If you use Docling Core in your projects, please consider citing the following:
+
+```bib
+@techreport{Docling,
+  author = "Deep Search Team",
+  month = 8,
+  title = "Docling Technical Report",
+  url = "https://arxiv.org/abs/2408.09869",
+  eprint = "2408.09869",
+  doi = "10.48550/arXiv.2408.09869",
+  version = "1.0.0",
+  year = 2024
+}
+```
+
+## License
+
+The Docling Core codebase is under MIT license.
+For individual model usage, please refer to the model licenses found in the original packages.
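As a rough companion to the README above, here is a minimal sketch of the document-building and export API it describes. The method names follow the DoclingDocument API; the document name and content are made up for illustration.

```python
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel

# Build a small document programmatically, then export it.
doc = DoclingDocument(name="sample")
doc.add_heading(text="Docling Core demo")
doc.add_text(label=DocItemLabel.TEXT, text="Hello from a DoclingDocument.")

print(doc.export_to_markdown())  # other helpers: export_to_html(), save_as_json(), ...
```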
{docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
@@ -88,7 +88,6 @@ class HybridChunker(BaseChunker):
                 "For updated usage check out "
                 "https://docling-project.github.io/docling/examples/hybrid_chunking/",
                 DeprecationWarning,
-                stacklevel=3,
             )
 
         if isinstance(tokenizer, str):
@@ -156,7 +155,6 @@ class HybridChunker(BaseChunker):
         meta = DocMeta(
             doc_items=doc_items,
             headings=doc_chunk.meta.headings,
-            captions=doc_chunk.meta.captions,
             origin=doc_chunk.meta.origin,
         )
         window_text = (
@@ -235,7 +233,9 @@ class HybridChunker(BaseChunker):
         )
         if available_length <= 0:
             warnings.warn(
-                […]
+                "Headers and captions for this chunk are longer than the total "
+                "amount of size for the chunk, chunk will be ignored: "
+                f"{doc_chunk.text=}"
             )
             return []
         text = doc_chunk.text
@@ -250,10 +250,10 @@ class HybridChunker(BaseChunker):
         num_chunks = len(chunks)
         while window_end < num_chunks:
             chunk = chunks[window_end]
-            […]
+            headings = chunk.meta.headings
             ready_to_append = False
             if window_start == window_end:
-                […]
+                current_headings = headings
                 window_end += 1
                 first_chunk_of_window = chunk
             else:
@@ -264,13 +264,12 @@ class HybridChunker(BaseChunker):
                     text=self.delim.join([chk.text for chk in chks]),
                     meta=DocMeta(
                         doc_items=doc_items,
-                        headings=
-                        captions=current_headings_and_captions[1],
+                        headings=current_headings,
                         origin=chunk.meta.origin,
                     ),
                 )
                 if (
-                    […]
+                    headings == current_headings
                     and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens
                 ):
                     # there is room to include the new chunk so add it to the window and
{docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
@@ -1,10 +1,11 @@
 """HuggingFace tokenization."""
 
-import
+import json
 from os import PathLike
 from typing import Optional, Union
 
-from
+from huggingface_hub import hf_hub_download
+from pydantic import ConfigDict, model_validator
 from typing_extensions import Self
 
 from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
@@ -28,16 +29,23 @@ class HuggingFaceTokenizer(BaseTokenizer):
 
     @model_validator(mode="after")
     def _patch(self) -> Self:
-        if
-        [… 9 more lines of the previous implementation not shown]
+        if self.max_tokens is None:
+            try:
+                # try to use SentenceTransformers-specific config as that seems to be
+                # reliable (whenever available)
+                config_name = "sentence_bert_config.json"
+                config_path = hf_hub_download(
+                    repo_id=self.tokenizer.name_or_path,
+                    filename=config_name,
+                )
+                with open(config_path) as f:
+                    data = json.load(f)
+                self.max_tokens = int(data["max_seq_length"])
+            except Exception as e:
+                raise RuntimeError(
+                    "max_tokens could not be determined automatically; please set "
+                    "explicitly."
+                ) from e
         return self
 
     def count_tokens(self, text: str):
{docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/common.py
RENAMED
@@ -11,7 +11,7 @@ from functools import cached_property
 from pathlib import Path
 from typing import Any, Iterable, Optional, Tuple, Union
 
-from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
+from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_field
 from typing_extensions import Self, override
 
 from docling_core.transforms.serializer.base import (
@@ -39,7 +39,11 @@ from docling_core.types.doc.document import (
     KeyValueItem,
     NodeItem,
     OrderedList,
+    PictureClassificationData,
+    PictureDataType,
+    PictureDescriptionData,
     PictureItem,
+    PictureMoleculeData,
     TableItem,
     TextItem,
     UnorderedList,
@@ -118,6 +122,23 @@ def _iterate_items(
         yield item
 
 
+def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
+    result = None
+    if isinstance(annotation, PictureClassificationData):
+        predicted_class = (
+            annotation.predicted_classes[0].class_name
+            if annotation.predicted_classes
+            else None
+        )
+        if predicted_class is not None:
+            result = predicted_class.replace("_", " ")
+    elif isinstance(annotation, PictureDescriptionData):
+        result = annotation.text
+    elif isinstance(annotation, PictureMoleculeData):
+        result = annotation.smi
+    return result
+
+
 def create_ser_result(
     *,
     text: str = "",
@@ -176,11 +197,7 @@ class CommonParams(BaseModel):
 class DocSerializer(BaseModel, BaseDocSerializer):
     """Class for document serializers."""
 
-    […]
-        """Pydantic config."""
-
-        arbitrary_types_allowed = True
-        extra = "forbid"
+    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
 
     doc: DoclingDocument
 
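Besides introducing the picture-annotation helper, the DocSerializer hunk above swaps the old class-based Pydantic configuration for pydantic v2's ConfigDict. A minimal standalone sketch of that configuration style with the same options as the diff (the model name is made up):

```python
from pydantic import BaseModel, ConfigDict


class ExampleSerializer(BaseModel):
    """Toy model using the pydantic v2 configuration style from the diff."""

    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")

    name: str


print(ExampleSerializer(name="demo"))
```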
{docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/html.py
RENAMED
@@ -35,6 +35,7 @@ from docling_core.transforms.serializer.base import (
 from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
+    _get_picture_annotation_text,
     create_ser_result,
 )
 from docling_core.transforms.serializer.html_styles import (
@@ -110,6 +111,8 @@ class HTMLParams(CommonParams):
     # Enable charts to be printed into HTML as tables
     enable_chart_tables: bool = True
 
+    include_annotations: bool = True
+
 
 class HTMLTextSerializer(BaseModel, BaseTextSerializer):
     """HTML-specific text item serializer."""
@@ -943,18 +946,46 @@ class HTMLDocSerializer(DocSerializer):
         params = self.params.merge_with_patch(patch=kwargs)
         results: list[SerializationResult] = []
         text_res = ""
+        excluded_refs = self.get_excluded_refs(**kwargs)
+
         if DocItemLabel.CAPTION in params.labels:
-            [… 11 lines of the previous implementation not shown]
+            for cap in item.captions:
+                if (
+                    isinstance(it := cap.resolve(self.doc), TextItem)
+                    and it.self_ref not in excluded_refs
+                ):
+                    text_cap = it.text
+                    text_dir = get_text_direction(text_cap)
+                    dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
+                    cap_ser_res = create_ser_result(
+                        text=(
+                            f'<div class="caption"{dir_str}>'
+                            f"{html.escape(text_cap)}"
+                            f"</div>"
+                        ),
+                        span_source=it,
+                    )
+                    results.append(cap_ser_res)
+
+        if params.include_annotations and item.self_ref not in excluded_refs:
+            if isinstance(item, PictureItem):
+                for ann in item.annotations:
+                    if ann_text := _get_picture_annotation_text(annotation=ann):
+                        text_dir = get_text_direction(ann_text)
+                        dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
+                        ann_ser_res = create_ser_result(
+                            text=(
+                                f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
+                                f"{html.escape(ann_text)}"
+                                f"</div>"
+                            ),
+                            span_source=item,
+                        )
+                        results.append(ann_ser_res)
+
+        text_res = params.caption_delim.join([r.text for r in results])
+        if text_res:
+            text_res = f"<{tag}>{text_res}</{tag}>"
         return create_ser_result(text=text_res, span_source=results)
 
     def _generate_head(self) -> str:
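With these changes the HTML serializer emits picture annotations as `<div data-annotation-kind="...">` blocks next to the caption, controlled by the new `include_annotations` flag. A hedged sketch of the export-side switch; `doc` is assumed to be an existing DoclingDocument with annotated pictures, and the output path is hypothetical.

```python
# Annotations on (default): description/classification/SMILES text is rendered
# as <div data-annotation-kind="...">...</div> next to each picture.
html_annotated = doc.export_to_html(include_annotations=True)

# Annotations off: only captions and the picture itself are rendered.
html_plain = doc.export_to_html(include_annotations=False)

doc.save_as_html("sample_annotated.html", include_annotations=True)  # hypothetical path
```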
{docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/markdown.py
RENAMED
@@ -29,6 +29,7 @@ from docling_core.transforms.serializer.base import (
 from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
+    _get_picture_annotation_text,
     _PageBreakSerResult,
     create_ser_result,
 )
@@ -69,6 +70,8 @@ class MarkdownParams(CommonParams):
     page_break_placeholder: Optional[str] = None  # e.g. "<!-- page break -->"
     escape_underscores: bool = True
     escape_html: bool = True
+    include_annotations: bool = True
+    mark_annotations: bool = False
 
 
 class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
@@ -210,6 +213,24 @@ class MarkdownPictureSerializer(BasePictureSerializer):
             res_parts.append(cap_res)
 
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+            if params.include_annotations:
+
+                for ann in item.annotations:
+                    if ann_text := _get_picture_annotation_text(annotation=ann):
+                        ann_ser_res = create_ser_result(
+                            text=(
+                                (
+                                    f'<!--<annotation kind="{ann.kind}">-->'
+                                    f"{ann_text}"
+                                    f"<!--<annotation/>-->"
+                                )
+                                if params.mark_annotations
+                                else ann_text
+                            ),
+                            span_source=item,
+                        )
+                        res_parts.append(ann_ser_res)
+
             img_res = self._serialize_image_part(
                 item=item,
                 doc=doc,
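On the Markdown side, the new `include_annotations` and `mark_annotations` parameters control whether picture annotation text is emitted and whether it is wrapped in `<!--<annotation kind="...">-->` markers. A hedged sketch, with `doc` assumed to be an existing DoclingDocument:

```python
# Annotation text emitted inline (default behaviour after this change).
md_inline = doc.export_to_markdown(include_annotations=True)

# Annotation text wrapped in HTML-comment markers so it can be post-processed.
md_marked = doc.export_to_markdown(include_annotations=True, mark_annotations=True)

# Annotations omitted entirely.
md_none = doc.export_to_markdown(include_annotations=False)
```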
{docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/base.py
RENAMED
@@ -395,3 +395,41 @@ class BoundingBox(BaseModel):
             raise ValueError("BoundingBoxes have different CoordOrigin")
 
         return cls(l=left, t=top, r=right, b=bottom, coord_origin=origin)
+
+    def x_overlap_with(self, other: "BoundingBox") -> float:
+        """Calculates the horizontal overlap with another bounding box."""
+        if self.coord_origin != other.coord_origin:
+            raise ValueError("BoundingBoxes have different CoordOrigin")
+        return max(0.0, min(self.r, other.r) - max(self.l, other.l))
+
+    def y_overlap_with(self, other: "BoundingBox") -> float:
+        """Calculates the vertical overlap with another bounding box, respecting coordinate origin."""
+        if self.coord_origin != other.coord_origin:
+            raise ValueError("BoundingBoxes have different CoordOrigin")
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return max(0.0, min(self.b, other.b) - max(self.t, other.t))
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return max(0.0, min(self.t, other.t) - max(self.b, other.b))
+        raise ValueError("Unsupported CoordOrigin")
+
+    def union_area_with(self, other: "BoundingBox") -> float:
+        """Calculates the union area with another bounding box."""
+        if self.coord_origin != other.coord_origin:
+            raise ValueError("BoundingBoxes have different CoordOrigin")
+        return self.area() + other.area() - self.intersection_area_with(other)
+
+    def x_union_with(self, other: "BoundingBox") -> float:
+        """Calculates the horizontal union dimension with another bounding box."""
+        if self.coord_origin != other.coord_origin:
+            raise ValueError("BoundingBoxes have different CoordOrigin")
+        return max(0.0, max(self.r, other.r) - min(self.l, other.l))
+
+    def y_union_with(self, other: "BoundingBox") -> float:
+        """Calculates the vertical union dimension with another bounding box, respecting coordinate origin."""
+        if self.coord_origin != other.coord_origin:
+            raise ValueError("BoundingBoxes have different CoordOrigin")
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return max(0.0, max(self.b, other.b) - min(self.t, other.t))
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return max(0.0, max(self.t, other.t) - min(self.b, other.b))
+        raise ValueError("Unsupported CoordOrigin")
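A quick sketch of the new BoundingBox helpers with hand-picked TOPLEFT-origin coordinates (values chosen only for illustration):

```python
from docling_core.types.doc.base import BoundingBox, CoordOrigin

a = BoundingBox(l=0, t=0, r=10, b=5, coord_origin=CoordOrigin.TOPLEFT)
b = BoundingBox(l=4, t=2, r=14, b=8, coord_origin=CoordOrigin.TOPLEFT)

print(a.x_overlap_with(b))   # 6.0  = min(10, 14) - max(0, 4)
print(a.y_overlap_with(b))   # 3.0  = min(5, 8) - max(0, 2)
print(a.x_union_with(b))     # 14.0 = max(10, 14) - min(0, 4)
print(a.union_area_with(b))  # 92.0 = 50 + 60 - 18 (both areas minus the intersection)
```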
{docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/document.py
RENAMED
@@ -11,6 +11,7 @@ import os
 import re
 import sys
 import typing
+import warnings
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
@@ -2924,6 +2925,7 @@ class DoclingDocument(BaseModel):
         page_no: Optional[int] = None,
         included_content_layers: Optional[set[ContentLayer]] = None,
         page_break_placeholder: Optional[str] = None,
+        include_annotations: bool = True,
     ):
         """Save to markdown."""
         if isinstance(filename, str):
@@ -2951,6 +2953,7 @@ class DoclingDocument(BaseModel):
             page_no=page_no,
             included_content_layers=included_content_layers,
             page_break_placeholder=page_break_placeholder,
+            include_annotations=include_annotations,
         )
 
         with open(filename, "w", encoding="utf-8") as fw:
@@ -2972,6 +2975,8 @@ class DoclingDocument(BaseModel):
         page_no: Optional[int] = None,
         included_content_layers: Optional[set[ContentLayer]] = None,
         page_break_placeholder: Optional[str] = None,  # e.g. "<!-- page break -->",
+        include_annotations: bool = True,
+        mark_annotations: bool = False,
     ) -> str:
         r"""Serialize to Markdown.
 
@@ -2991,9 +2996,9 @@ class DoclingDocument(BaseModel):
         :type labels: Optional[set[DocItemLabel]] = None
         :param strict_text: Deprecated.
         :type strict_text: bool = False
-        :param
+        :param escape_underscores: bool: Whether to escape underscores in the
             text content of the document. (Default value = True).
-        :type
+        :type escape_underscores: bool = True
         :param image_placeholder: The placeholder to include to position
             images in the markdown. (Default value = "\<!-- image --\>").
         :type image_placeholder: str = "<!-- image -->"
@@ -3009,6 +3014,12 @@ class DoclingDocument(BaseModel):
         :param page_break_placeholder: The placeholder to include for marking page
             breaks. None means no page break placeholder will be used.
         :type page_break_placeholder: Optional[str] = None
+        :param include_annotations: bool: Whether to include annotations in the export.
+            (Default value = True).
+        :type include_annotations: bool = True
+        :param mark_annotations: bool: Whether to mark annotations in the export; only
+            relevant if include_annotations is True. (Default value = False).
+        :type mark_annotations: bool = False
         :returns: The exported Markdown representation.
         :rtype: str
         """
@@ -3038,6 +3049,8 @@ class DoclingDocument(BaseModel):
                 indent=indent,
                 wrap_width=text_width if text_width > 0 else None,
                 page_break_placeholder=page_break_placeholder,
+                include_annotations=include_annotations,
+                mark_annotations=mark_annotations,
             ),
         )
         ser_res = serializer.serialize()
@@ -3087,6 +3100,7 @@ class DoclingDocument(BaseModel):
         html_head: str = "null",  # should be deprecated
         included_content_layers: Optional[set[ContentLayer]] = None,
         split_page_view: bool = False,
+        include_annotations: bool = True,
     ):
         """Save to HTML."""
         if isinstance(filename, str):
@@ -3112,6 +3126,7 @@ class DoclingDocument(BaseModel):
             html_head=html_head,
             included_content_layers=included_content_layers,
             split_page_view=split_page_view,
+            include_annotations=include_annotations,
        )
 
         with open(filename, "w", encoding="utf-8") as fw:
@@ -3164,6 +3179,7 @@ class DoclingDocument(BaseModel):
         html_head: str = "null",  # should be deprecated ...
         included_content_layers: Optional[set[ContentLayer]] = None,
         split_page_view: bool = False,
+        include_annotations: bool = True,
     ) -> str:
         r"""Serialize to HTML."""
         from docling_core.transforms.serializer.html import (
@@ -3195,6 +3211,7 @@ class DoclingDocument(BaseModel):
             html_head=html_head,
             html_lang=html_lang,
             output_style=output_style,
+            include_annotations=include_annotations,
         )
 
         if html_head == "null":
@@ -4109,7 +4126,10 @@ class DoclingDocument(BaseModel):
     @classmethod
     def validate_document(cls, d: "DoclingDocument"):
         """validate_document."""
-        [… 2 lines of the previous implementation not shown]
+        with warnings.catch_warnings():
+            # ignore warning from deprecated furniture
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
+                raise ValueError("Document hierachy is inconsistent.")
 
         return d
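The validate_document hunk above silences the DeprecationWarning triggered by touching the deprecated `furniture` tree while the hierarchy is checked. The same standalone pattern, for reference (an illustrative helper, not library code):

```python
import warnings


def check_quietly(check) -> bool:
    """Run a validation callable while silencing DeprecationWarning only."""
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        return bool(check())


# usage sketch with an existing DoclingDocument `doc`:
# ok = check_quietly(lambda: doc.validate_tree(doc.body) and doc.validate_tree(doc.furniture))
```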