docling-core 2.31.2__tar.gz → 2.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic.

Files changed (109)
  1. docling_core-2.33.0/PKG-INFO +143 -0
  2. docling_core-2.33.0/README.md +99 -0
  3. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/hybrid_chunker.py +7 -8
  4. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +20 -12
  5. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/common.py +23 -6
  6. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/html.py +42 -11
  7. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/markdown.py +21 -0
  8. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/base.py +38 -0
  9. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/document.py +24 -4
  10. docling_core-2.33.0/docling_core.egg-info/PKG-INFO +143 -0
  11. docling_core-2.33.0/docling_core.egg-info/SOURCES.txt +104 -0
  12. docling_core-2.33.0/docling_core.egg-info/dependency_links.txt +1 -0
  13. docling_core-2.33.0/docling_core.egg-info/entry_points.txt +2 -0
  14. docling_core-2.33.0/docling_core.egg-info/requires.txt +18 -0
  15. docling_core-2.33.0/docling_core.egg-info/top_level.txt +1 -0
  16. docling_core-2.33.0/pyproject.toml +157 -0
  17. docling_core-2.33.0/setup.cfg +4 -0
  18. docling_core-2.33.0/test/test_base.py +295 -0
  19. docling_core-2.33.0/test/test_collection.py +158 -0
  20. docling_core-2.33.0/test/test_data_gen_flag.py +9 -0
  21. docling_core-2.33.0/test/test_doc_base.py +45 -0
  22. docling_core-2.33.0/test/test_doc_legacy_convert.py +40 -0
  23. docling_core-2.33.0/test/test_doc_schema.py +147 -0
  24. docling_core-2.33.0/test/test_doc_schema_extractor.py +31 -0
  25. docling_core-2.33.0/test/test_docling_doc.py +1603 -0
  26. docling_core-2.33.0/test/test_doctags_load.py +143 -0
  27. docling_core-2.33.0/test/test_hierarchical_chunker.py +73 -0
  28. docling_core-2.33.0/test/test_hybrid_chunker.py +384 -0
  29. docling_core-2.33.0/test/test_json_schema_to_search_mapper.py +106 -0
  30. docling_core-2.33.0/test/test_nlp_qa.py +46 -0
  31. docling_core-2.33.0/test/test_otsl_table_export.py +284 -0
  32. docling_core-2.33.0/test/test_page.py +79 -0
  33. docling_core-2.33.0/test/test_rec_schema.py +268 -0
  34. docling_core-2.33.0/test/test_search_meta.py +49 -0
  35. docling_core-2.33.0/test/test_serialization.py +372 -0
  36. docling_core-2.33.0/test/test_utils.py +94 -0
  37. docling_core-2.33.0/test/test_visualization.py +42 -0
  38. docling_core-2.31.2/PKG-INFO +0 -143
  39. docling_core-2.31.2/README.md +0 -97
  40. docling_core-2.31.2/pyproject.toml +0 -168
  41. {docling_core-2.31.2 → docling_core-2.33.0}/LICENSE +0 -0
  42. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/__init__.py +0 -0
  43. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/cli/__init__.py +0 -0
  44. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/cli/view.py +0 -0
  45. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/experimental/__init__.py +0 -0
  46. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/py.typed +0 -0
  47. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  48. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  49. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  50. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  51. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  52. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  53. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  54. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  55. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/__init__.py +0 -0
  56. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  57. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/mapping.py +0 -0
  58. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/meta.py +0 -0
  59. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/search/package.py +0 -0
  60. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/__init__.py +0 -0
  61. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/__init__.py +0 -0
  62. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/base.py +0 -0
  63. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  64. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  65. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  66. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  67. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/__init__.py +0 -0
  68. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/base.py +0 -0
  69. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/doctags.py +0 -0
  70. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  71. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  72. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/visualizer/base.py +0 -0
  73. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  74. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  75. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/__init__.py +0 -0
  76. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/base.py +0 -0
  77. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/__init__.py +0 -0
  78. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/labels.py +0 -0
  79. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/page.py +0 -0
  80. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/tokens.py +0 -0
  81. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/doc/utils.py +0 -0
  82. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/gen/__init__.py +0 -0
  83. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/gen/generic.py +0 -0
  84. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/io/__init__.py +0 -0
  85. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  86. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/base.py +0 -0
  87. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  88. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  89. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  90. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/document.py +0 -0
  91. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  92. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/nlp/__init__.py +0 -0
  93. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/nlp/qa.py +0 -0
  94. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/nlp/qa_labels.py +0 -0
  95. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/__init__.py +0 -0
  96. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/attribute.py +0 -0
  97. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/base.py +0 -0
  98. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/predicate.py +0 -0
  99. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/record.py +0 -0
  100. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/statement.py +0 -0
  101. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/types/rec/subject.py +0 -0
  102. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/__init__.py +0 -0
  103. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/alias.py +0 -0
  104. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/file.py +0 -0
  105. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/generate_docs.py +0 -0
  106. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/generate_jsonschema.py +0 -0
  107. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/legacy.py +0 -0
  108. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/validate.py +0 -0
  109. {docling_core-2.31.2 → docling_core-2.33.0}/docling_core/utils/validators.py +0 -0
docling_core-2.33.0/PKG-INFO

@@ -0,0 +1,143 @@
+ Metadata-Version: 2.4
+ Name: docling-core
+ Version: 2.33.0
+ Summary: A python library to define and validate data types in Docling.
+ Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
+ Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
+ License-Expression: MIT
+ Project-URL: homepage, https://github.com/docling-project
+ Project-URL: repository, https://github.com/docling-project/docling-core
+ Project-URL: issues, https://github.com/docling-project/docling-core/issues
+ Project-URL: changelog, https://github.com/docling-project/docling-core/blob/main/CHANGELOG.md
+ Keywords: docling,discovery,etl,information retrieval,analytics,database,database schema,schema,JSON
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Natural Language :: English
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Topic :: Database
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Typing :: Typed
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: <4.0,>=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: jsonschema<5.0.0,>=4.16.0
+ Requires-Dist: pydantic!=2.10.0,!=2.10.1,!=2.10.2,<3.0.0,>=2.6.0
+ Requires-Dist: jsonref<2.0.0,>=1.1.0
+ Requires-Dist: tabulate<0.10.0,>=0.9.0
+ Requires-Dist: pandas<3.0.0,>=2.1.4
+ Requires-Dist: pillow<12.0.0,>=10.0.0
+ Requires-Dist: pyyaml<7.0.0,>=5.1
+ Requires-Dist: typing-extensions<5.0.0,>=4.12.2
+ Requires-Dist: typer<0.16.0,>=0.12.5
+ Requires-Dist: latex2mathml<4.0.0,>=3.77.0
+ Provides-Extra: chunking
+ Requires-Dist: semchunk<3.0.0,>=2.2.0; extra == "chunking"
+ Requires-Dist: transformers<5.0.0,>=4.34.0; extra == "chunking"
+ Provides-Extra: chunking-openai
+ Requires-Dist: semchunk; extra == "chunking-openai"
+ Requires-Dist: tiktoken<0.10.0,>=0.9.0; extra == "chunking-openai"
+ Dynamic: license-file
+
+ # Docling Core
+
+ [![PyPI version](https://img.shields.io/pypi/v/docling-core)](https://pypi.org/project/docling-core/)
+ ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%20%203.11%20%7C%203.12%20%7C%203.13-blue)
+ [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
+ [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/)
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling-core)](https://opensource.org/licenses/MIT)
+
+ Docling Core is a library that defines core data types and transformations in [Docling](https://github.com/docling-project/docling).
+
+ ## Installation
+
+ To use Docling Core, simply install `docling-core` from your package manager, e.g. pip:
+ ```bash
+ pip install docling-core
+ ```
+
+ ### Development setup
+
+ To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and uv. You can then install from your local clone's root dir:
+ ```bash
+ uv sync --all-extras
+ ```
+
+ To run the pytest suite, execute:
+ ```
+ uv run pytest -s test
+ ```
+
+ ## Main features
+
+ Docling Core provides the foundational DoclingDocument data model and API, as well as
+ additional APIs for tasks like serialization and chunking, which are key to developing
+ generative AI applications using Docling.
+
+ ### DoclingDocument
+
+ Docling Core defines the DoclingDocument as a Pydantic model, allowing for advanced
+ data model control, customizability, and interoperability.
+
+ In addition to specifying the schema, it provides a handy API for building documents,
+ as well as for basic operations, e.g. exporting to various formats, like Markdown, HTML,
+ and others.
+
+ 👉 More details:
+ - [Architecture docs](https://docling-project.github.io/docling/concepts/architecture/)
+ - [DoclingDocument docs](https://docling-project.github.io/docling/concepts/docling_document/)
+
+ ### Serialization
+
+ Different users can have varying requirements when it comes to serialization.
+ To address this, the Serialization API introduces a design that allows easy extension,
+ while providing feature-rich built-in implementations (on which the respective
+ DoclingDocument helpers are actually based).
+
+ 👉 More details:
+ - [Serialization docs](https://docling-project.github.io/docling/concepts/serialization/)
+ - [Serialization example](https://docling-project.github.io/docling/examples/serialization/)
+
+ ### Chunking
+
+ Similarly to above, the Chunking API provides built-in chunking capabilities as well as
+ a design that enables easy extension, this way tackling customization requirements of
+ different use cases.
+
+ 👉 More details:
+ - [Chunking docs](https://docling-project.github.io/docling/concepts/chunking/)
+ - [Hybrid chunking example](https://docling-project.github.io/docling/examples/hybrid_chunking/)
+ - [Advanced chunking and serialization](https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/)
+
+ ## Contributing
+
+ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
+
+ ## References
+
+ If you use Docling Core in your projects, please consider citing the following:
+
+ ```bib
+ @techreport{Docling,
+ author = "Deep Search Team",
+ month = 8,
+ title = "Docling Technical Report",
+ url = "https://arxiv.org/abs/2408.09869",
+ eprint = "2408.09869",
+ doi = "10.48550/arXiv.2408.09869",
+ version = "1.0.0",
+ year = 2024
+ }
+ ```
+
+ ## License
+
+ The Docling Core codebase is under MIT license.
+ For individual model usage, please refer to the model licenses found in the original packages.

docling_core-2.33.0/README.md

@@ -0,0 +1,99 @@
+ # Docling Core
+
+ [![PyPI version](https://img.shields.io/pypi/v/docling-core)](https://pypi.org/project/docling-core/)
+ ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%20%203.11%20%7C%203.12%20%7C%203.13-blue)
+ [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
+ [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/)
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling-core)](https://opensource.org/licenses/MIT)
+
+ Docling Core is a library that defines core data types and transformations in [Docling](https://github.com/docling-project/docling).
+
+ ## Installation
+
+ To use Docling Core, simply install `docling-core` from your package manager, e.g. pip:
+ ```bash
+ pip install docling-core
+ ```
+
+ ### Development setup
+
+ To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and uv. You can then install from your local clone's root dir:
+ ```bash
+ uv sync --all-extras
+ ```
+
+ To run the pytest suite, execute:
+ ```
+ uv run pytest -s test
+ ```
+
+ ## Main features
+
+ Docling Core provides the foundational DoclingDocument data model and API, as well as
+ additional APIs for tasks like serialization and chunking, which are key to developing
+ generative AI applications using Docling.
+
+ ### DoclingDocument
+
+ Docling Core defines the DoclingDocument as a Pydantic model, allowing for advanced
+ data model control, customizability, and interoperability.
+
+ In addition to specifying the schema, it provides a handy API for building documents,
+ as well as for basic operations, e.g. exporting to various formats, like Markdown, HTML,
+ and others.
+
+ 👉 More details:
+ - [Architecture docs](https://docling-project.github.io/docling/concepts/architecture/)
+ - [DoclingDocument docs](https://docling-project.github.io/docling/concepts/docling_document/)
+
+ ### Serialization
+
+ Different users can have varying requirements when it comes to serialization.
+ To address this, the Serialization API introduces a design that allows easy extension,
+ while providing feature-rich built-in implementations (on which the respective
+ DoclingDocument helpers are actually based).
+
+ 👉 More details:
+ - [Serialization docs](https://docling-project.github.io/docling/concepts/serialization/)
+ - [Serialization example](https://docling-project.github.io/docling/examples/serialization/)
+
+ ### Chunking
+
+ Similarly to above, the Chunking API provides built-in chunking capabilities as well as
+ a design that enables easy extension, this way tackling customization requirements of
+ different use cases.
+
+ 👉 More details:
+ - [Chunking docs](https://docling-project.github.io/docling/concepts/chunking/)
+ - [Hybrid chunking example](https://docling-project.github.io/docling/examples/hybrid_chunking/)
+ - [Advanced chunking and serialization](https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/)
+
+ ## Contributing
+
+ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
+
+ ## References
+
+ If you use Docling Core in your projects, please consider citing the following:
+
+ ```bib
+ @techreport{Docling,
+ author = "Deep Search Team",
+ month = 8,
+ title = "Docling Technical Report",
+ url = "https://arxiv.org/abs/2408.09869",
+ eprint = "2408.09869",
+ doi = "10.48550/arXiv.2408.09869",
+ version = "1.0.0",
+ year = 2024
+ }
+ ```
+
+ ## License
+
+ The Docling Core codebase is under MIT license.
+ For individual model usage, please refer to the model licenses found in the original packages.

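As a quick orientation for the README content above, here is a minimal sketch of building and exporting a document. The `add_heading`/`add_text` helpers, label enum, and import paths are assumptions based on the documented DoclingDocument API, not part of this diff.

```python
# Minimal sketch (assumed API: DoclingDocument building helpers and Markdown export).
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel

doc = DoclingDocument(name="sample")
doc.add_heading(text="Hello Docling")  # adds a section header item to the body
doc.add_text(label=DocItemLabel.PARAGRAPH, text="A paragraph built via the API.")
print(doc.export_to_markdown())  # same helper whose signature changes below
```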
docling_core/transforms/chunker/hybrid_chunker.py

@@ -88,7 +88,6 @@ class HybridChunker(BaseChunker):
  "For updated usage check out "
  "https://docling-project.github.io/docling/examples/hybrid_chunking/",
  DeprecationWarning,
- stacklevel=3,
  )

  if isinstance(tokenizer, str):
@@ -156,7 +155,6 @@ class HybridChunker(BaseChunker):
  meta = DocMeta(
  doc_items=doc_items,
  headings=doc_chunk.meta.headings,
- captions=doc_chunk.meta.captions,
  origin=doc_chunk.meta.origin,
  )
  window_text = (
@@ -235,7 +233,9 @@ class HybridChunker(BaseChunker):
  )
  if available_length <= 0:
  warnings.warn(
- f"Headers and captions for this chunk are longer than the total amount of size for the chunk, chunk will be ignored: {doc_chunk.text=}" # noqa
+ "Headers and captions for this chunk are longer than the total "
+ "amount of size for the chunk, chunk will be ignored: "
+ f"{doc_chunk.text=}"
  )
  return []
  text = doc_chunk.text
@@ -250,10 +250,10 @@ class HybridChunker(BaseChunker):
  num_chunks = len(chunks)
  while window_end < num_chunks:
  chunk = chunks[window_end]
- headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
+ headings = chunk.meta.headings
  ready_to_append = False
  if window_start == window_end:
- current_headings_and_captions = headings_and_captions
+ current_headings = headings
  window_end += 1
  first_chunk_of_window = chunk
  else:
@@ -264,13 +264,12 @@ class HybridChunker(BaseChunker):
  text=self.delim.join([chk.text for chk in chks]),
  meta=DocMeta(
  doc_items=doc_items,
- headings=current_headings_and_captions[0],
- captions=current_headings_and_captions[1],
+ headings=current_headings,
  origin=chunk.meta.origin,
  ),
  )
  if (
- headings_and_captions == current_headings_and_captions
+ headings == current_headings
  and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens
  ):
  # there is room to include the new chunk so add it to the window and

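For context on the HybridChunker changes above (peer merging is now keyed on headings only, and captions are dropped from `DocMeta`), here is a minimal usage sketch; `doc` and `tokenizer` are assumed to exist, and `merge_peers` reflects the chunker's public option rather than anything introduced in this diff.

```python
# Minimal sketch (assumes `doc` is a DoclingDocument and `tokenizer` a BaseTokenizer).
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True)
for chunk in chunker.chunk(dl_doc=doc):
    # after this release, chunks merged within a window share headings only
    print(chunk.meta.headings, "->", chunk.text[:60])
```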
docling_core/transforms/chunker/tokenizer/huggingface.py

@@ -1,10 +1,11 @@
  """HuggingFace tokenization."""

- import sys
+ import json
  from os import PathLike
  from typing import Optional, Union

- from pydantic import ConfigDict, PositiveInt, TypeAdapter, model_validator
+ from huggingface_hub import hf_hub_download
+ from pydantic import ConfigDict, model_validator
  from typing_extensions import Self

  from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
@@ -28,16 +29,23 @@ class HuggingFaceTokenizer(BaseTokenizer):

  @model_validator(mode="after")
  def _patch(self) -> Self:
- if hasattr(self.tokenizer, "model_max_length"):
- model_max_tokens: PositiveInt = TypeAdapter(PositiveInt).validate_python(
- self.tokenizer.model_max_length
- )
- user_max_tokens = self.max_tokens or sys.maxsize
- self.max_tokens = min(model_max_tokens, user_max_tokens)
- elif self.max_tokens is None:
- raise ValueError(
- "max_tokens must be defined as model does not define model_max_length"
- )
+ if self.max_tokens is None:
+ try:
+ # try to use SentenceTransformers-specific config as that seems to be
+ # reliable (whenever available)
+ config_name = "sentence_bert_config.json"
+ config_path = hf_hub_download(
+ repo_id=self.tokenizer.name_or_path,
+ filename=config_name,
+ )
+ with open(config_path) as f:
+ data = json.load(f)
+ self.max_tokens = int(data["max_seq_length"])
+ except Exception as e:
+ raise RuntimeError(
+ "max_tokens could not be determined automatically; please set "
+ "explicitly."
+ ) from e
  return self

  def count_tokens(self, text: str):

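The tokenizer change above means that when `max_tokens` is not given, it is now resolved from the model's `sentence_bert_config.json` on the Hub, and a `RuntimeError` is raised if that lookup fails; passing `max_tokens` explicitly sidesteps the lookup entirely. A minimal sketch (the model id is just an example):

```python
from transformers import AutoTokenizer

from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer

hf_tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=256,  # explicit value skips the hf_hub_download fallback shown above
)
print(hf_tokenizer.count_tokens("Docling Core"))
```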
docling_core/transforms/serializer/common.py

@@ -11,7 +11,7 @@ from functools import cached_property
  from pathlib import Path
  from typing import Any, Iterable, Optional, Tuple, Union

- from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
+ from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_field
  from typing_extensions import Self, override

  from docling_core.transforms.serializer.base import (
@@ -39,7 +39,11 @@ from docling_core.types.doc.document import (
  KeyValueItem,
  NodeItem,
  OrderedList,
+ PictureClassificationData,
+ PictureDataType,
+ PictureDescriptionData,
  PictureItem,
+ PictureMoleculeData,
  TableItem,
  TextItem,
  UnorderedList,
@@ -118,6 +122,23 @@ def _iterate_items(
  yield item


+ def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
+ result = None
+ if isinstance(annotation, PictureClassificationData):
+ predicted_class = (
+ annotation.predicted_classes[0].class_name
+ if annotation.predicted_classes
+ else None
+ )
+ if predicted_class is not None:
+ result = predicted_class.replace("_", " ")
+ elif isinstance(annotation, PictureDescriptionData):
+ result = annotation.text
+ elif isinstance(annotation, PictureMoleculeData):
+ result = annotation.smi
+ return result
+
+
  def create_ser_result(
  *,
  text: str = "",
@@ -176,11 +197,7 @@ class CommonParams(BaseModel):
  class DocSerializer(BaseModel, BaseDocSerializer):
  """Class for document serializers."""

- class Config:
- """Pydantic config."""
-
- arbitrary_types_allowed = True
- extra = "forbid"
+ model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")

  doc: DoclingDocument

docling_core/transforms/serializer/html.py

@@ -35,6 +35,7 @@ from docling_core.transforms.serializer.base import (
  from docling_core.transforms.serializer.common import (
  CommonParams,
  DocSerializer,
+ _get_picture_annotation_text,
  create_ser_result,
  )
  from docling_core.transforms.serializer.html_styles import (
@@ -110,6 +111,8 @@ class HTMLParams(CommonParams):
  # Enable charts to be printed into HTML as tables
  enable_chart_tables: bool = True

+ include_annotations: bool = True
+

  class HTMLTextSerializer(BaseModel, BaseTextSerializer):
  """HTML-specific text item serializer."""
@@ -943,18 +946,46 @@ class HTMLDocSerializer(DocSerializer):
  params = self.params.merge_with_patch(patch=kwargs)
  results: list[SerializationResult] = []
  text_res = ""
+ excluded_refs = self.get_excluded_refs(**kwargs)
+
  if DocItemLabel.CAPTION in params.labels:
- results = [
- create_ser_result(text=it.text, span_source=it)
- for cap in item.captions
- if isinstance(it := cap.resolve(self.doc), TextItem)
- and it.self_ref not in self.get_excluded_refs(**kwargs)
- ]
- text_res = params.caption_delim.join([r.text for r in results])
- if text_res:
- text_dir = get_text_direction(text_res)
- dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
- text_res = f"<{tag}{dir_str}>{html.escape(text_res)}</{tag}>"
+ for cap in item.captions:
+ if (
+ isinstance(it := cap.resolve(self.doc), TextItem)
+ and it.self_ref not in excluded_refs
+ ):
+ text_cap = it.text
+ text_dir = get_text_direction(text_cap)
+ dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
+ cap_ser_res = create_ser_result(
+ text=(
+ f'<div class="caption"{dir_str}>'
+ f"{html.escape(text_cap)}"
+ f"</div>"
+ ),
+ span_source=it,
+ )
+ results.append(cap_ser_res)
+
+ if params.include_annotations and item.self_ref not in excluded_refs:
+ if isinstance(item, PictureItem):
+ for ann in item.annotations:
+ if ann_text := _get_picture_annotation_text(annotation=ann):
+ text_dir = get_text_direction(ann_text)
+ dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
+ ann_ser_res = create_ser_result(
+ text=(
+ f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
+ f"{html.escape(ann_text)}"
+ f"</div>"
+ ),
+ span_source=item,
+ )
+ results.append(ann_ser_res)
+
+ text_res = params.caption_delim.join([r.text for r in results])
+ if text_res:
+ text_res = f"<{tag}>{text_res}</{tag}>"
  return create_ser_result(text=text_res, span_source=results)

  def _generate_head(self) -> str:

docling_core/transforms/serializer/markdown.py

@@ -29,6 +29,7 @@ from docling_core.transforms.serializer.base import (
  from docling_core.transforms.serializer.common import (
  CommonParams,
  DocSerializer,
+ _get_picture_annotation_text,
  _PageBreakSerResult,
  create_ser_result,
  )
@@ -69,6 +70,8 @@ class MarkdownParams(CommonParams):
  page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
  escape_underscores: bool = True
  escape_html: bool = True
+ include_annotations: bool = True
+ mark_annotations: bool = False


  class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
@@ -210,6 +213,24 @@ class MarkdownPictureSerializer(BasePictureSerializer):
  res_parts.append(cap_res)

  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+ if params.include_annotations:
+
+ for ann in item.annotations:
+ if ann_text := _get_picture_annotation_text(annotation=ann):
+ ann_ser_res = create_ser_result(
+ text=(
+ (
+ f'<!--<annotation kind="{ann.kind}">-->'
+ f"{ann_text}"
+ f"<!--<annotation/>-->"
+ )
+ if params.mark_annotations
+ else ann_text
+ ),
+ span_source=item,
+ )
+ res_parts.append(ann_ser_res)
+
  img_res = self._serialize_image_part(
  item=item,
  doc=doc,

docling_core/types/doc/base.py

@@ -395,3 +395,41 @@ class BoundingBox(BaseModel):
  raise ValueError("BoundingBoxes have different CoordOrigin")

  return cls(l=left, t=top, r=right, b=bottom, coord_origin=origin)
+
+ def x_overlap_with(self, other: "BoundingBox") -> float:
+ """Calculates the horizontal overlap with another bounding box."""
+ if self.coord_origin != other.coord_origin:
+ raise ValueError("BoundingBoxes have different CoordOrigin")
+ return max(0.0, min(self.r, other.r) - max(self.l, other.l))
+
+ def y_overlap_with(self, other: "BoundingBox") -> float:
+ """Calculates the vertical overlap with another bounding box, respecting coordinate origin."""
+ if self.coord_origin != other.coord_origin:
+ raise ValueError("BoundingBoxes have different CoordOrigin")
+ if self.coord_origin == CoordOrigin.TOPLEFT:
+ return max(0.0, min(self.b, other.b) - max(self.t, other.t))
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+ return max(0.0, min(self.t, other.t) - max(self.b, other.b))
+ raise ValueError("Unsupported CoordOrigin")
+
+ def union_area_with(self, other: "BoundingBox") -> float:
+ """Calculates the union area with another bounding box."""
+ if self.coord_origin != other.coord_origin:
+ raise ValueError("BoundingBoxes have different CoordOrigin")
+ return self.area() + other.area() - self.intersection_area_with(other)
+
+ def x_union_with(self, other: "BoundingBox") -> float:
+ """Calculates the horizontal union dimension with another bounding box."""
+ if self.coord_origin != other.coord_origin:
+ raise ValueError("BoundingBoxes have different CoordOrigin")
+ return max(0.0, max(self.r, other.r) - min(self.l, other.l))
+
+ def y_union_with(self, other: "BoundingBox") -> float:
+ """Calculates the vertical union dimension with another bounding box, respecting coordinate origin."""
+ if self.coord_origin != other.coord_origin:
+ raise ValueError("BoundingBoxes have different CoordOrigin")
+ if self.coord_origin == CoordOrigin.TOPLEFT:
+ return max(0.0, max(self.b, other.b) - min(self.t, other.t))
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+ return max(0.0, max(self.t, other.t) - min(self.b, other.b))
+ raise ValueError("Unsupported CoordOrigin")

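A minimal sketch exercising the new BoundingBox helpers added above, including an IoU computed from the existing `intersection_area_with` plus the new `union_area_with`; the coordinates are made up.

```python
from docling_core.types.doc.base import BoundingBox, CoordOrigin

a = BoundingBox(l=0, t=0, r=10, b=10, coord_origin=CoordOrigin.TOPLEFT)
b = BoundingBox(l=5, t=5, r=15, b=15, coord_origin=CoordOrigin.TOPLEFT)

print(a.x_overlap_with(b))  # 5.0
print(a.y_overlap_with(b))  # 5.0 (TOPLEFT: min of bottoms minus max of tops)
print(a.x_union_with(b))    # 15.0

iou = a.intersection_area_with(b) / a.union_area_with(b)
print(round(iou, 3))        # 25 / 175 -> 0.143
```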
docling_core/types/doc/document.py

@@ -11,6 +11,7 @@ import os
  import re
  import sys
  import typing
+ import warnings
  from enum import Enum
  from io import BytesIO
  from pathlib import Path
@@ -2924,6 +2925,7 @@ class DoclingDocument(BaseModel):
  page_no: Optional[int] = None,
  included_content_layers: Optional[set[ContentLayer]] = None,
  page_break_placeholder: Optional[str] = None,
+ include_annotations: bool = True,
  ):
  """Save to markdown."""
  if isinstance(filename, str):
@@ -2951,6 +2953,7 @@ class DoclingDocument(BaseModel):
  page_no=page_no,
  included_content_layers=included_content_layers,
  page_break_placeholder=page_break_placeholder,
+ include_annotations=include_annotations,
  )

  with open(filename, "w", encoding="utf-8") as fw:
@@ -2972,6 +2975,8 @@ class DoclingDocument(BaseModel):
  page_no: Optional[int] = None,
  included_content_layers: Optional[set[ContentLayer]] = None,
  page_break_placeholder: Optional[str] = None, # e.g. "<!-- page break -->",
+ include_annotations: bool = True,
+ mark_annotations: bool = False,
  ) -> str:
  r"""Serialize to Markdown.

@@ -2991,9 +2996,9 @@ class DoclingDocument(BaseModel):
  :type labels: Optional[set[DocItemLabel]] = None
  :param strict_text: Deprecated.
  :type strict_text: bool = False
- :param escaping_underscores: bool: Whether to escape underscores in the
+ :param escape_underscores: bool: Whether to escape underscores in the
  text content of the document. (Default value = True).
- :type escaping_underscores: bool = True
+ :type escape_underscores: bool = True
  :param image_placeholder: The placeholder to include to position
  images in the markdown. (Default value = "\<!-- image --\>").
  :type image_placeholder: str = "<!-- image -->"
@@ -3009,6 +3014,12 @@ class DoclingDocument(BaseModel):
  :param page_break_placeholder: The placeholder to include for marking page
  breaks. None means no page break placeholder will be used.
  :type page_break_placeholder: Optional[str] = None
+ :param include_annotations: bool: Whether to include annotations in the export.
+ (Default value = True).
+ :type include_annotations: bool = True
+ :param mark_annotations: bool: Whether to mark annotations in the export; only
+ relevant if include_annotations is True. (Default value = False).
+ :type mark_annotations: bool = False
  :returns: The exported Markdown representation.
  :rtype: str
  """
@@ -3038,6 +3049,8 @@ class DoclingDocument(BaseModel):
  indent=indent,
  wrap_width=text_width if text_width > 0 else None,
  page_break_placeholder=page_break_placeholder,
+ include_annotations=include_annotations,
+ mark_annotations=mark_annotations,
  ),
  )
  ser_res = serializer.serialize()
@@ -3087,6 +3100,7 @@ class DoclingDocument(BaseModel):
  html_head: str = "null", # should be deprecated
  included_content_layers: Optional[set[ContentLayer]] = None,
  split_page_view: bool = False,
+ include_annotations: bool = True,
  ):
  """Save to HTML."""
  if isinstance(filename, str):
@@ -3112,6 +3126,7 @@ class DoclingDocument(BaseModel):
  html_head=html_head,
  included_content_layers=included_content_layers,
  split_page_view=split_page_view,
+ include_annotations=include_annotations,
  )

  with open(filename, "w", encoding="utf-8") as fw:
@@ -3164,6 +3179,7 @@ class DoclingDocument(BaseModel):
  html_head: str = "null", # should be deprecated ...
  included_content_layers: Optional[set[ContentLayer]] = None,
  split_page_view: bool = False,
+ include_annotations: bool = True,
  ) -> str:
  r"""Serialize to HTML."""
  from docling_core.transforms.serializer.html import (
@@ -3195,6 +3211,7 @@ class DoclingDocument(BaseModel):
  html_head=html_head,
  html_lang=html_lang,
  output_style=output_style,
+ include_annotations=include_annotations,
  )

  if html_head == "null":
@@ -4109,7 +4126,10 @@ class DoclingDocument(BaseModel):
  @classmethod
  def validate_document(cls, d: "DoclingDocument"):
  """validate_document."""
- if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
- raise ValueError("Document hierachy is inconsistent.")
+ with warnings.catch_warnings():
+ # ignore warning from deprecated furniture
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
+ raise ValueError("Document hierachy is inconsistent.")

  return d
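Tying the serializer and document changes together, here is a minimal sketch of the new annotation-related export switches; it assumes `doc` is a DoclingDocument whose pictures carry description, classification, or molecule annotations.

```python
# include_annotations / mark_annotations correspond to the parameters added above.
md = doc.export_to_markdown(include_annotations=True, mark_annotations=True)
html = doc.export_to_html(include_annotations=True)

doc.save_as_markdown("out.md", include_annotations=True)
doc.save_as_html("out.html", include_annotations=True)
```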